sparrow 1.0.0
Loading...
Searching...
No Matches
/home/runner/work/sparrow/sparrow/include/sparrow/struct_array.hpp

Array implementation for storing structured data with named fields.

Array implementation for storing structured data with named fields. The struct_array provides a columnar storage format for structured data, where each struct element consists of multiple named fields (children arrays). This is similar to database records or C structs but optimized for analytical workloads with columnar memory layout.

Key features:

The Arrow struct layout stores:

Related Apache Arrow description and specification: https://arrow.apache.org/docs/dev/format/Intro.html#struct https://arrow.apache.org/docs/format/Columnar.html#struct-layout

Precondition
All child arrays must have the same length
Field names must be unique within the struct
Postcondition
Maintains Arrow struct format compatibility ("+s")
All child arrays remain synchronized in length
Thread-safe for read operations, requires external synchronization for writes
// Create child arrays for fields
primitive_array<int32_t> id_array({1, 2, 3});
primitive_array<std::string> name_array({"Alice", "Bob", "Charlie"});
// Create struct array from children
std::vector<array> children = {
std::move(id_array).with_name("id"),
std::move(name_array).with_name("name")
};
struct_array persons(std::move(children));
// Access struct elements
auto person = persons[0]; // Get struct_value
auto id_field = person["id"]; // Access field by name
void struct_array()
// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ranges>
#include <string_view>
#include <version>
#if defined(__cpp_lib_format)
#endif
namespace sparrow
{
class struct_array;
// Implementation-detail trait specializations for struct_array.
namespace detail
{
// Specialization of get_data_type_from_array for struct_array. Its static
// get() is declared to return a sparrow::data_type, i.e. the runtime
// identifier of the Arrow data type associated with this array class.
template <>
struct get_data_type_from_array<struct_array>
{
// NOTE(review): the body below is empty in this listing even though the
// return type is non-void. The return statement was presumably dropped when
// the listing was generated; it should name the struct enumerator of
// sparrow::data_type -- confirm against the original header.
[[nodiscard]] static constexpr sparrow::data_type get()
{
}
};
}
template <>
struct array_inner_types<struct_array> : array_inner_types_base
{
using array_type = struct_array;
using inner_value_type = struct_value;
using inner_reference = struct_value;
using inner_const_reference = struct_value;
using value_iterator = functor_index_iterator<detail::layout_value_functor<array_type, inner_value_type>>;
using const_value_iterator = functor_index_iterator<
detail::layout_value_functor<const array_type, inner_value_type>>;
using iterator_tag = std::random_access_iterator_tag;
};
/**
 * Type trait that is true exactly when T is struct_array.
 *
 * The comparison uses std::same_as, so cv-qualified or reference types do
 * not match.
 */
template <typename T>
constexpr bool is_struc_array_v = std::same_as<T, struct_array>;
// Array layout storing structured data with named fields: each struct element
// is made of one value per child array (field). The class derives from
// array_bitmap_base<struct_array> and keeps one wrapper per child array in
// m_children.
//
// NOTE(review): self_type, base_type and bitmap_const_reference are used below
// but not declared in this listing; they are presumably introduced by lines
// the listing dropped or inherited from the base class -- confirm against the
// original header.
class struct_array final : public array_bitmap_base<struct_array>
{
public:
// Type aliases. Iterator types come from array_inner_types<self_type>,
// size_type and bitmap_type from the base class.
using inner_types = array_inner_types<self_type>;
using value_iterator = typename inner_types::value_iterator;
using const_value_iterator = typename inner_types::const_value_iterator;
using size_type = typename base_type::size_type;
using bitmap_type = typename base_type::bitmap_type;
// Elements are exposed as struct_value for value, reference and const
// reference alike.
using inner_value_type = struct_value;
using inner_reference = struct_value;
using inner_const_reference = struct_value;
// Public element types wrap the inner types in nullable<>.
using value_type = nullable<inner_value_type>;
using const_reference = nullable<inner_const_reference, bitmap_const_reference>;
// Constructs the array from an arrow_proxy (taken by value). Defined out of
// line.
SPARROW_API explicit struct_array(arrow_proxy proxy);
// Generic constructor: forwards all arguments to one of the private
// create_proxy overloads and delegates to the arrow_proxy constructor with
// the result.
// NOTE(review): no constraint is visible here to keep this template from
// competing with the copy/move constructors; a requires-clause may have been
// dropped from the listing -- confirm.
template <class... Args>
explicit struct_array(Args&&... args)
: struct_array(create_proxy(std::forward<Args>(args)...))
{
}
// Defaulted move constructor.
struct_array(struct_array&&) = default;
// Gets the number of child arrays (fields).
[[nodiscard]] SPARROW_API size_type children_count() const;
// Gets const pointer to child array at specified index.
[[nodiscard]] SPARROW_API const array_wrapper* raw_child(std::size_t i) const;
// Non-const overload of raw_child: pointer to the child array at index i.
[[nodiscard]] SPARROW_API array_wrapper* raw_child(std::size_t i);
private:
// Builds the arrow_proxy from child arrays plus an explicit validity input.
// The struct length is taken from the first child (0 when there are no
// children) and the validity input is normalized with ensure_validity_bitmap
// before delegating to create_proxy_impl.
template <
std::ranges::input_range CHILDREN_RANGE,
validity_bitmap_input VB = validity_bitmap,
input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
[[nodiscard]] static auto create_proxy(
CHILDREN_RANGE&& children,
VB&& bitmaps,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
) -> arrow_proxy;
// Builds the arrow_proxy from child arrays and a nullable flag. When nullable
// is true a validity bitmap sized to the first child is created; otherwise no
// bitmap is passed to create_proxy_impl.
template <std::ranges::input_range CHILDREN_RANGE, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
[[nodiscard]] static auto create_proxy(
CHILDREN_RANGE&& children,
bool nullable = true,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
) -> arrow_proxy;
// Shared implementation behind both create_proxy overloads: extracts the
// Arrow structures of every child and assembles the "+s" schema/array pair.
// An empty optional bitmap yields a schema without the NULLABLE flag.
template <std::ranges::input_range CHILDREN_RANGE, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
[[nodiscard]] static auto create_proxy_impl(
CHILDREN_RANGE&& children,
std::optional<validity_bitmap>&& bitmap,
std::optional<std::string_view> name = std::nullopt,
std::optional<METADATA_RANGE> metadata = std::nullopt
) -> arrow_proxy;
// Storage type for the per-field child wrappers.
using children_type = std::vector<cloning_ptr<array_wrapper>>;
// Value iteration hooks; private, and reachable by array_crtp_base through
// the friend declaration below.
[[nodiscard]] SPARROW_API value_iterator value_begin();
[[nodiscard]] SPARROW_API value_iterator value_end();
[[nodiscard]] SPARROW_API const_value_iterator value_cbegin() const;
[[nodiscard]] SPARROW_API const_value_iterator value_cend() const;
// Element accessors returning the struct_value at index i.
[[nodiscard]] SPARROW_API inner_reference value(size_type i);
[[nodiscard]] SPARROW_API inner_const_reference value(size_type i) const;
// Produces a children_type container; presumably used to (re)build
// m_children from the underlying Arrow data -- confirm in the definition.
[[nodiscard]] SPARROW_API children_type make_children();
// data members
children_type m_children;
// friend classes
friend class array_crtp_base<self_type>;
// needs access to this->value(i)
friend class detail::layout_value_functor<self_type, inner_value_type>;
friend class detail::layout_value_functor<const self_type, inner_value_type>;
};
/**
 * Builds the arrow_proxy of a struct array from its children and an explicit
 * validity input.
 *
 * @param children       Child arrays, one per field; forwarded to create_proxy_impl.
 * @param validity_input Validity description, normalized through
 *                       ensure_validity_bitmap to a bitmap of the struct length.
 * @param name           Optional name forwarded to create_proxy_impl.
 * @param metadata       Optional metadata forwarded to create_proxy_impl.
 * @return The arrow_proxy produced by create_proxy_impl.
 */
template <std::ranges::input_range CHILDREN_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
    requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
auto struct_array::create_proxy(
    CHILDREN_RANGE&& children,
    VB&& validity_input,
    std::optional<std::string_view> name,
    std::optional<METADATA_RANGE> metadata
) -> arrow_proxy
{
    // The struct length is the length of the first child; no children means 0.
    std::size_t row_count = 0;
    if (!children.empty())
    {
        row_count = children[0].size();
    }

    // Normalize the validity input before the children are handed over.
    validity_bitmap validity = ensure_validity_bitmap(row_count, std::forward<VB>(validity_input));

    return create_proxy_impl(
        std::forward<CHILDREN_RANGE>(children),
        std::move(validity),
        std::move(name),
        std::move(metadata)
    );
}
/**
 * Builds the arrow_proxy of a struct array from its children and a nullable
 * flag.
 *
 * @param children Child arrays, one per field; forwarded to create_proxy_impl.
 * @param nullable When true, a validity_bitmap is constructed from
 *                 (nullptr, length) and passed on; when false, no bitmap is
 *                 passed.
 * @param name     Optional name forwarded to create_proxy_impl.
 * @param metadata Optional metadata forwarded to create_proxy_impl.
 * @return The arrow_proxy produced by create_proxy_impl.
 */
template <std::ranges::input_range CHILDREN_RANGE, input_metadata_container METADATA_RANGE>
    requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
auto struct_array::create_proxy(
    CHILDREN_RANGE&& children,
    bool nullable,
    std::optional<std::string_view> name,
    std::optional<METADATA_RANGE> metadata
) -> arrow_proxy
{
    // The struct length is the length of the first child; no children means 0.
    const std::size_t row_count = children.empty() ? 0 : children[0].size();

    // Only nullable arrays carry a bitmap; it is built in place from
    // (nullptr, length).
    std::optional<validity_bitmap> validity;
    if (nullable)
    {
        validity.emplace(nullptr, row_count);
    }

    return create_proxy_impl(
        std::forward<CHILDREN_RANGE>(children),
        std::move(validity),
        std::move(name),
        std::move(metadata)
    );
}
/**
 * Assembles the Arrow schema/array pair of a struct layout and wraps it in an
 * arrow_proxy.
 *
 * Every child is converted to its ArrowArray/ArrowSchema pair with
 * extract_arrow_structures; the pairs become the children of a schema with
 * format "+s" and of an array whose only buffer is the validity bitmap.
 *
 * @param children Child arrays, one per field. All must have the same length
 *                 (checked with SPARROW_ASSERT_TRUE); each one is moved from.
 * @param bitmap   Optional validity bitmap. When present the schema gets the
 *                 NULLABLE flag and the bitmap's storage becomes buffer 0;
 *                 when absent no flags are set, the null count is 0 and
 *                 buffer 0 is an empty buffer.
 * @param name     Optional name stored in the schema.
 * @param metadata Optional metadata stored in the schema.
 * @return An arrow_proxy built from the created array and schema.
 */
template <std::ranges::input_range CHILDREN_RANGE, input_metadata_container METADATA_RANGE>
    requires std::same_as<std::ranges::range_value_t<CHILDREN_RANGE>, array>
auto struct_array::create_proxy_impl(
    CHILDREN_RANGE&& children,
    std::optional<validity_bitmap>&& bitmap,
    std::optional<std::string_view> name,
    std::optional<METADATA_RANGE> metadata
) -> arrow_proxy
{
    // NOTE(review): the template is constrained on input_range only, yet
    // size() and operator[] are used on `children` below.
    const auto n_children = children.size();

    // Raw pointer tables: they are passed to make_arrow_schema /
    // make_arrow_array below with every children_ownership entry set to true.
    // NOTE(review): nothing guards these allocations if a later step throws
    // before that hand-off.
    ArrowSchema** child_schemas = new ArrowSchema*[n_children];
    ArrowArray** child_arrays = new ArrowArray*[n_children];

    // The struct length is the length of the first child; no children means 0.
    const auto size = children.empty() ? 0 : children[0].size();

    for (std::size_t i = 0; i < n_children; ++i)
    {
        auto& child = children[i];
        SPARROW_ASSERT_TRUE(child.size() == size);
        // Move the Arrow structures out of the child and store them on the heap.
        auto [flat_arr, flat_schema] = extract_arrow_structures(std::move(child));
        child_arrays[i] = new ArrowArray(std::move(flat_arr));
        child_schemas[i] = new ArrowSchema(std::move(flat_schema));
    }

    const bool bitmap_has_value = bitmap.has_value();
    const auto null_count = bitmap_has_value ? bitmap->null_count() : 0;
    // The NULLABLE flag is set only when a validity bitmap was supplied.
    const auto flags = bitmap_has_value
                           ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
                           : std::nullopt;

    ArrowSchema schema = make_arrow_schema(
        std::string("+s"),                    // format
        std::move(name),                      // name
        std::move(metadata),                  // metadata
        flags,                                // flags,
        child_schemas,                        // children
        repeat_view<bool>(true, n_children),  // children_ownership
        nullptr,                              // dictionary
        true                                  // dictionary ownership
    );

    // Buffer 0 holds the validity bitmap storage, or an empty buffer when the
    // array is not nullable.
    buffer<uint8_t> bitmap_buffer = bitmap_has_value ? std::move(*bitmap).extract_storage()
                                                     : buffer<uint8_t>{nullptr, 0};

    std::vector<buffer<std::uint8_t>> arr_buffs(1);
    arr_buffs[0] = std::move(bitmap_buffer);

    ArrowArray arr = make_arrow_array(
        static_cast<std::int64_t>(size),        // length
        static_cast<std::int64_t>(null_count),  // null_count
        0,                                      // offset
        std::move(arr_buffs),
        child_arrays,                           // children
        repeat_view<bool>(true, n_children),    // children_ownership
        nullptr,                                // dictionary
        true                                    // dictionary ownership
    );
    return arrow_proxy{std::move(arr), std::move(schema)};
}
}
#if defined(__cpp_lib_format)
/// std::formatter specialization for sparrow::struct_array.
template <>
struct std::formatter<sparrow::struct_array>
{
    /// Parses no format specification: the begin iterator of the parse
    /// context is returned as-is.
    constexpr auto parse(std::format_parse_context& ctx) -> decltype(ctx.begin())
    {
        return ctx.begin();
    }

    /// Formats the given struct_array into the format context. Only declared
    /// here; the definition is not part of this listing.
    SPARROW_API auto format(const sparrow::struct_array& struct_array, std::format_context& ctx) const
        -> decltype(ctx.out());
};
namespace sparrow
{
    /// Stream insertion operator for struct_array. Only declared here; the
    /// definition is not part of this listing.
    SPARROW_API auto operator<<(std::ostream& os, const struct_array& value) -> std::ostream&;
}
#endif
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
array_inner_types< self_type > inner_types
SPARROW_API const array_wrapper * raw_child(std::size_t i) const
Gets const pointer to child array at specified index.
nullable< inner_value_type > value_type
nullable< inner_const_reference, bitmap_const_reference > const_reference
struct_value inner_reference
base_type::iterator_tag iterator_tag
struct_value inner_value_type
struct_value inner_const_reference
SPARROW_API size_type children_count() const
Gets the number of child arrays (fields).
base_type::const_bitmap_range const_bitmap_range
SPARROW_API struct_array & operator=(const struct_array &rhs)
Copy assignment operator.
typename base_type::bitmap_const_reference bitmap_const_reference
typename inner_types::const_value_iterator const_value_iterator
array_bitmap_base< self_type > base_type
#define SPARROW_API
Definition config.hpp:38
#define SPARROW_ASSERT_TRUE(expr__)
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
array_bitmap_base_impl< D, false > array_bitmap_base
Convenient alias for arrays with immutable validity bitmaps.
constexpr bool is_struc_array_v
Type trait to check if a type is a struct_array.
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArray and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
std::ostream & operator<<(std::ostream &os, const nullval_t &)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.