sparrow 1.0.0
Loading...
Searching...
No Matches
/home/runner/work/sparrow/sparrow/include/sparrow/map_array.hpp

Map array implementation for storing key-value pairs in a list-like structure.

Map array implementation for storing key-value pairs in a list-like structure. This class implements an Arrow-compatible array for storing maps (associative arrays) where each element is a collection of key-value pairs. The implementation uses a struct array with two fields (keys and values) as the child array, combined with an offset buffer to delimit individual maps within the flat structure.

The map array follows the Apache Arrow Map specification: https://arrow.apache.org/docs/format/Columnar.html#map-layout

Precondition
Keys within each map should be unique (though not enforced)
Keys can optionally be sorted within each map (affects ArrowFlag::MAP_KEYS_SORTED)
Postcondition
Maintains Arrow Map format compatibility
Uses struct array with "key" and "value" fields as child array
Offset buffer has size = array_size + 1
// Create arrays for keys and values
primitive_array<std::string> keys({"a", "b", "c", "d"});
primitive_array<int> values({1, 2, 3, 4});
// Create offset buffer: map 0 has 2 items, map 1 has 2 items
std::vector<int32_t> offsets{0, 2, 4};
map_array arr(std::move(keys), std::move(values), std::move(offsets));
// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace sparrow
{
class map_array;
template <>
struct array_inner_types<map_array> : array_inner_types_base
{
using array_type = map_array;
using inner_value_type = map_value;
using inner_reference = map_value;
using inner_const_reference = map_value;
using value_iterator = functor_index_iterator<detail::layout_value_functor<array_type, inner_value_type>>;
using const_value_iterator = functor_index_iterator<
detail::layout_value_functor<const array_type, inner_value_type>>;
using iterator_tag = std::random_access_iterator_tag;
};
template <class T>
constexpr bool is_map_array_v = std::same_as<T, map_array>;
namespace detail
{
template <>
struct get_data_type_from_array<sparrow::map_array>
{
[[nodiscard]] static constexpr sparrow::data_type get()
{
}
};
}
class map_array final : public array_bitmap_base<map_array>
{
public:
using inner_types = array_inner_types<self_type>;
using value_iterator = inner_types::value_iterator;
using const_value_iterator = inner_types::const_value_iterator;
using size_type = typename base_type::size_type;
using offset_type = const std::int32_t;
using offset_buffer_type = u8_buffer<std::remove_const_t<offset_type>>;
using bitmap_type = typename base_type::bitmap_type;
using inner_value_type = inner_types::inner_value_type;
using inner_reference = inner_types::inner_reference;
using inner_const_reference = inner_types::inner_const_reference;
using value_type = nullable<inner_value_type>;
using const_reference = nullable<inner_const_reference, bitmap_const_reference>;
SPARROW_API explicit map_array(arrow_proxy proxy);
template <class... Args>
explicit map_array(Args&&... args)
: self_type(create_proxy(std::forward<Args>(args)...))
{
}
map_array(map_array&&) noexcept = default;
map_array& operator=(map_array&&) noexcept = default;
[[nodiscard]] SPARROW_API const array_wrapper* raw_keys_array() const;
[[nodiscard]] SPARROW_API array_wrapper* raw_keys_array();
[[nodiscard]] SPARROW_API const array_wrapper* raw_items_array() const;
[[nodiscard]] SPARROW_API array_wrapper* raw_items_array();
template <std::ranges::range SIZES_RANGE>
[[nodiscard]] static auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
private:
[[nodiscard]] SPARROW_API value_iterator value_begin();
[[nodiscard]] SPARROW_API value_iterator value_end();
[[nodiscard]] SPARROW_API const_value_iterator value_cbegin() const;
[[nodiscard]] SPARROW_API const_value_iterator value_cend() const;
[[nodiscard]] SPARROW_API inner_reference value(size_type i);
[[nodiscard]] SPARROW_API inner_const_reference value(size_type i) const;
[[nodiscard]] SPARROW_API offset_type* make_list_offsets() const;
[[nodiscard]] SPARROW_API cloning_ptr<array_wrapper> make_entries_array() const;
[[nodiscard]] SPARROW_API bool get_keys_sorted() const;
[[nodiscard]] SPARROW_API static bool
check_keys_sorted(const array& flat_keys, const offset_buffer_type& offsets);
template <
validity_bitmap_input VB = validity_bitmap,
input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
[[nodiscard]] static arrow_proxy create_proxy(
array&& flat_keys,
array&& flat_items,
offset_buffer_type&& list_offsets,
VB&& validity_input,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
);
template <
validity_bitmap_input VB = validity_bitmap,
std::ranges::input_range OFFSET_BUFFER_RANGE,
input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
requires std::convertible_to<std::ranges::range_value_t<OFFSET_BUFFER_RANGE>, offset_type>
[[nodiscard]] static arrow_proxy create_proxy(
array&& flat_keys,
array&& flat_items,
OFFSET_BUFFER_RANGE&& list_offsets_range,
VB&& validity_input,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
)
{
offset_buffer_type list_offsets{std::move(list_offsets_range)};
return map_array::create_proxy(
std::move(flat_keys),
std::move(flat_items),
std::move(list_offsets),
std::forward<VB>(validity_input),
std::forward<std::optional<std::string_view>>(name),
std::forward<std::optional<METADATA_RANGE>>(metadata)
);
}
/*
* @brief Creates Arrow proxy from keys, values, offsets, and nullable flag.
*
* @tparam METADATA_RANGE Type of metadata container
* @param flat_keys Array containing all map keys
* @param flat_values Array containing all map values
* @param list_offsets Buffer of offsets indicating map boundaries
* @param nullable Whether the array should support null values
* @param name Optional name for the array
* @param metadata Optional metadata for the array
* @return Arrow proxy containing the map array data and schema
*
* @pre flat_keys must be a valid array
* @pre flat_values must be a valid array
* @pre flat_keys.size() must equal flat_values.size()
* @pre list_offsets.size() must be >= 1
* @pre Last offset must not exceed flat_keys.size()
* @post If nullable is true, array supports null values (though none initially set)
* @post If nullable is false, array does not support null values
* @post Keys sorted flag is determined from actual key ordering
*/
template <
validity_bitmap_input VB = validity_bitmap,
input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
[[nodiscard]] static arrow_proxy create_proxy(
array&& flat_keys,
array&& flat_values,
offset_buffer_type&& list_offsets,
bool nullable = true,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
);
template <
validity_bitmap_input VB = validity_bitmap,
std::ranges::input_range OFFSET_BUFFER_RANGE,
input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
requires std::convertible_to<std::ranges::range_value_t<OFFSET_BUFFER_RANGE>, offset_type>
[[nodiscard]] static arrow_proxy create_proxy(
array&& flat_keys,
array&& flat_items,
OFFSET_BUFFER_RANGE&& list_offsets_range,
bool nullable = true,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
)
{
offset_buffer_type list_offsets{std::move(list_offsets_range)};
return map_array::create_proxy(
std::move(flat_keys),
std::move(flat_items),
std::move(list_offsets),
nullable,
std::forward<std::optional<std::string_view>>(name),
std::forward<std::optional<METADATA_RANGE>>(metadata)
);
}
static constexpr std::size_t OFFSET_BUFFER_INDEX = 1;
offset_type* p_list_offsets;
cloning_ptr<array_wrapper> p_entries_array;
bool m_keys_sorted;
// friend classes
friend class array_crtp_base<map_array>;
friend class detail::layout_value_functor<const map_array, map_value>;
};
template <std::ranges::range SIZES_RANGE>
auto map_array::offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type
{
std::forward<SIZES_RANGE>(sizes)
);
}
template <validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
arrow_proxy map_array::create_proxy(
array&& flat_keys,
array&& flat_items,
offset_buffer_type&& list_offsets,
VB&& validity_input,
std::optional<std::string_view> name,
std::optional<METADATA_RANGE> metadata
)
{
const auto size = list_offsets.size() - 1;
validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
std::optional<std::unordered_set<ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
bool keys_sorted = check_keys_sorted(flat_keys, list_offsets);
if (keys_sorted)
{
flags.value().insert(ArrowFlag::MAP_KEYS_SORTED);
}
std::array<sparrow::array, 2> struct_children = {std::move(flat_keys), std::move(flat_items)};
struct_array entries(std::move(struct_children), false, std::string("entries"));
auto [entries_arr, entries_schema] = extract_arrow_structures(std::move(entries));
const auto null_count = vbitmap.null_count();
const repeat_view<bool> children_ownership{true, 1};
std::string("+m"),
name, // name
metadata, // metadata
flags, // flags,
new ArrowSchema*[1]{new ArrowSchema(std::move(entries_schema))},
children_ownership, // children ownership
nullptr, // dictionary
true // dictionary ownership
);
std::vector<buffer<std::uint8_t>> arr_buffs = {
std::move(vbitmap).extract_storage(),
std::move(list_offsets).extract_storage()
};
static_cast<std::int64_t>(size), // length
static_cast<std::int64_t>(null_count),
0, // offset
std::move(arr_buffs),
new ArrowArray*[1]{new ArrowArray(std::move(entries_arr))},
children_ownership, // children ownership
nullptr, // dictionary
true // dictionary ownership
);
return arrow_proxy{std::move(arr), std::move(schema)};
}
template <validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
arrow_proxy map_array::create_proxy(
array&& flat_keys,
array&& flat_items,
offset_buffer_type&& list_offsets,
bool nullable,
std::optional<std::string_view> name,
std::optional<METADATA_RANGE> metadata
)
{
if (nullable)
{
return map_array::create_proxy(
std::move(flat_keys),
std::move(flat_items),
std::move(list_offsets),
name,
metadata
);
}
else
{
bool keys_sorted = check_keys_sorted(flat_keys, list_offsets);
auto flags = keys_sorted
? std::optional<std::unordered_set<ArrowFlag>>{{ArrowFlag::MAP_KEYS_SORTED}}
: std::nullopt;
const auto size = list_offsets.size() - 1;
std::array<sparrow::array, 2> struct_children = {std::move(flat_keys), std::move(flat_items)};
struct_array entries(std::move(struct_children), false, std::string("entries"));
auto [entries_arr, entries_schema] = extract_arrow_structures(std::move(entries));
const repeat_view<bool> children_ownership{true, 1};
std::string_view("+m"),
name, // name
metadata, // metadata
flags, // flags,
new ArrowSchema*[1]{new ArrowSchema(std::move(entries_schema))},
children_ownership, // children ownership
nullptr, // dictionary
true // dictionary ownership
);
std::vector<buffer<std::uint8_t>> arr_buffs = {
buffer<std::uint8_t>{nullptr, 0}, // no validity bitmap
std::move(list_offsets).extract_storage()
};
static_cast<std::int64_t>(size), // length
0,
0, // offset
std::move(arr_buffs),
new ArrowArray*[1]{new ArrowArray(std::move(entries_arr))},
children_ownership, // children ownership
nullptr, // dictionary
true // dictionary ownership
);
return arrow_proxy{std::move(arr), std::move(schema)};
}
}
}
void struct_array()
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
inner_types::inner_const_reference inner_const_reference
inner_types::const_value_iterator const_value_iterator
Definition map_array.hpp:95
typename base_type::iterator_tag iterator_tag
nullable< inner_value_type > value_type
SPARROW_API map_array & operator=(const self_type &rhs)
Copy assignment operator.
inner_types::inner_value_type inner_value_type
map_array self_type
Definition map_array.hpp:91
typename base_type::const_bitmap_range const_bitmap_range
SPARROW_API const array_wrapper * raw_items_array() const
Gets read-only access to the values array.
static auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from map sizes.
SPARROW_API const array_wrapper * raw_keys_array() const
Gets read-only access to the keys array.
inner_types::inner_reference inner_reference
const std::int32_t offset_type
Definition map_array.hpp:97
u8_buffer< std::remove_const_t< offset_type > > offset_buffer_type
Definition map_array.hpp:98
array_bitmap_base< self_type > base_type
Definition map_array.hpp:92
SPARROW_API map_array(arrow_proxy proxy)
Constructs map array from Arrow proxy.
nullable< inner_const_reference, bitmap_const_reference > const_reference
array_inner_types< self_type > inner_types
Definition map_array.hpp:93
typename base_type::bitmap_const_reference bitmap_const_reference
#define SPARROW_API
Definition config.hpp:38
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
array_bitmap_base_impl< D, false > array_bitmap_base
Convenient alias for arrays with immutable validity bitmaps.
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArray and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
constexpr bool is_map_array_v
Definition map_array.hpp:44
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.