sparrow 1.3.0
Loading...
Searching...
No Matches
/home/runner/work/sparrow/sparrow/include/sparrow/layout/array_registry.hpp

Registry for array factories supporting base types and extensions.

Registry for array factories supporting base types and extensions. This registry provides a centralized mechanism for creating array instances from arrow_proxy objects. It supports:

  1. Base types: All fundamental Arrow data types (primitives, lists, structs, etc.)
  2. Extensions: Arrow extension types that override base type behavior based on metadata
  3. Dispatch: Type-safe visitor pattern for polymorphic array operations

The registry follows a two-tier lookup strategy:

Extension types are identified by the "ARROW:extension:name" metadata key.

// Register a custom extension
auto& registry = array_registry::instance();
registry.register_extension(
    data_type::BINARY,
    "my.custom.type",
    [](arrow_proxy proxy)
    {
        return cloning_ptr<array_wrapper>{
            new array_wrapper_impl<my_custom_array>(my_custom_array(std::move(proxy)))
        };
    }
);

// Use the factory
auto arr_wrapper = array_factory(some_proxy);  // Automatically dispatches to right type

// Use dispatch to visit the array with type safety
auto result = registry.dispatch([](auto&& array) { return array.size(); }, *arr_wrapper);

// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <functional>
#include <limits>
#include <stdexcept>
#include <string_view>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
// Array type includes for dispatch support
namespace sparrow
{
// Maps a data_type enumerator to the concrete array class used for it.
// Only declared here: the specializations appear further down in this file,
// so naming array_type_map<DT>::type for an unmapped data_type fails to compile.
template <data_type DT> struct array_type_map;

// Maps an integer data_type to the key type of a dictionary-encoded array.
// Declared only; specialized further down for the integer data types.
template <data_type DT> struct dictionary_key_type;

// Maps a timestamp data_type to its array types with and without timezone.
// Declared only; specialized further down for the timestamp data types.
template <data_type DT> struct timestamp_type_map;

// Shorthand for array_type_map<DT>::type.
template <data_type DT> using array_type_t = typename array_type_map<DT>::type;

// Shorthand for dictionary_key_type<DT>::type.
template <data_type DT> using dictionary_key_t = typename dictionary_key_type<DT>::type;
// List of all supported data types - single source of truth
// make_dispatch_table() creates one dispatch entry per element of this array, and
// dispatch_base_type() indexes the resulting table with static_cast<std::size_t>(dt).
// NOTE(review): the initializer list is empty in this view of the file; the enumerators
// appear to have been dropped - confirm against the original header. Presumably each
// element must sit at the index equal to its data_type value for that lookup to be
// correct - TODO confirm.
inline constexpr std::array all_data_types = {
};
// clang-format off
// array_type_map specializations: one per supported data_type, each naming the
// concrete array class through its nested `type` alias.

// --- Null and boolean ---
template <>
struct array_type_map<data_type::NA> { using type = null_array; };
template <>
struct array_type_map<data_type::BOOL> { using type = primitive_array<bool>; };

// --- Fixed-width integers ---
template <>
struct array_type_map<data_type::UINT8> { using type = primitive_array<std::uint8_t>; };
template <>
struct array_type_map<data_type::INT8> { using type = primitive_array<std::int8_t>; };
template <>
struct array_type_map<data_type::UINT16> { using type = primitive_array<std::uint16_t>; };
template <>
struct array_type_map<data_type::INT16> { using type = primitive_array<std::int16_t>; };
template <>
struct array_type_map<data_type::UINT32> { using type = primitive_array<std::uint32_t>; };
template <>
struct array_type_map<data_type::INT32> { using type = primitive_array<std::int32_t>; };
template <>
struct array_type_map<data_type::UINT64> { using type = primitive_array<std::uint64_t>; };
template <>
struct array_type_map<data_type::INT64> { using type = primitive_array<std::int64_t>; };

// --- Floating point ---
template <>
struct array_type_map<data_type::HALF_FLOAT> { using type = primitive_array<float16_t>; };
template <>
struct array_type_map<data_type::FLOAT> { using type = primitive_array<float32_t>; };
template <>
struct array_type_map<data_type::DOUBLE> { using type = primitive_array<float64_t>; };

// --- Strings and binary ---
template <>
struct array_type_map<data_type::STRING> { using type = string_array; };
template <>
struct array_type_map<data_type::STRING_VIEW> { using type = string_view_array; };
template <>
struct array_type_map<data_type::LARGE_STRING> { using type = big_string_array; };
template <>
struct array_type_map<data_type::BINARY> { using type = binary_array; };
template <>
struct array_type_map<data_type::BINARY_VIEW> { using type = binary_view_array; };
template <>
struct array_type_map<data_type::LARGE_BINARY> { using type = big_binary_array; };

// --- Lists ---
template <>
struct array_type_map<data_type::LIST> { using type = list_array; };
template <>
struct array_type_map<data_type::LARGE_LIST> { using type = big_list_array; };
template <>
struct array_type_map<data_type::LIST_VIEW> { using type = list_view_array; };
template <>
struct array_type_map<data_type::LARGE_LIST_VIEW> { using type = big_list_view_array; };
template <>
struct array_type_map<data_type::FIXED_SIZED_LIST> { using type = fixed_sized_list_array; };

// --- Struct, map, run-end encoded and unions ---
template <>
struct array_type_map<data_type::STRUCT> { using type = struct_array; };
template <>
struct array_type_map<data_type::MAP> { using type = map_array; };
template <>
struct array_type_map<data_type::RUN_ENCODED> { using type = run_end_encoded_array; };
template <>
struct array_type_map<data_type::DENSE_UNION> { using type = dense_union_array; };
template <>
struct array_type_map<data_type::SPARSE_UNION> { using type = sparse_union_array; };

// --- Decimals ---
template <>
struct array_type_map<data_type::DECIMAL32> { using type = decimal_32_array; };
template <>
struct array_type_map<data_type::DECIMAL64> { using type = decimal_64_array; };
template <>
struct array_type_map<data_type::DECIMAL128> { using type = decimal_128_array; };
template <>
struct array_type_map<data_type::DECIMAL256> { using type = decimal_256_array; };

// --- Fixed-width binary ---
template <>
struct array_type_map<data_type::FIXED_WIDTH_BINARY> { using type = fixed_width_binary_array; };

// --- Dates ---
template <>
struct array_type_map<data_type::DATE_DAYS> { using type = date_days_array; };
template <>
struct array_type_map<data_type::DATE_MILLISECONDS> { using type = date_milliseconds_array; };

// --- Timestamps ---
// These entries name the timestamp_*_array types; array_registry::dispatch_for_type
// goes through timestamp_type_map instead and picks the without-timezone variant
// when get_timezone() returns nullptr.
template <>
struct array_type_map<data_type::TIMESTAMP_SECONDS> { using type = timestamp_seconds_array; };
template <>
struct array_type_map<data_type::TIMESTAMP_MILLISECONDS> { using type = timestamp_milliseconds_array; };
template <>
struct array_type_map<data_type::TIMESTAMP_MICROSECONDS> { using type = timestamp_microseconds_array; };
template <>
struct array_type_map<data_type::TIMESTAMP_NANOSECONDS> { using type = timestamp_nanoseconds_array; };

// --- Durations ---
template <>
struct array_type_map<data_type::DURATION_SECONDS> { using type = duration_seconds_array; };
template <>
struct array_type_map<data_type::DURATION_MILLISECONDS> { using type = duration_milliseconds_array; };
template <>
struct array_type_map<data_type::DURATION_MICROSECONDS> { using type = duration_microseconds_array; };
template <>
struct array_type_map<data_type::DURATION_NANOSECONDS> { using type = duration_nanoseconds_array; };

// --- Intervals ---
template <>
struct array_type_map<data_type::INTERVAL_MONTHS> { using type = months_interval_array; };
template <>
struct array_type_map<data_type::INTERVAL_DAYS_TIME> { using type = days_time_interval_array; };
template <>
struct array_type_map<data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS> { using type = month_day_nanoseconds_interval_array; };

// --- Time of day ---
template <>
struct array_type_map<data_type::TIME_SECONDS> { using type = time_seconds_array; };
template <>
struct array_type_map<data_type::TIME_MILLISECONDS> { using type = time_milliseconds_array; };
template <>
struct array_type_map<data_type::TIME_MICROSECONDS> { using type = time_microseconds_array; };
template <>
struct array_type_map<data_type::TIME_NANOSECONDS> { using type = time_nanoseconds_array; };
// dictionary_key_type specializations: each integer data_type maps to the
// fixed-width integer type used as the key of a dictionary-encoded array.
template <>
struct dictionary_key_type<data_type::UINT8> { using type = std::uint8_t; };
template <>
struct dictionary_key_type<data_type::INT8> { using type = std::int8_t; };
template <>
struct dictionary_key_type<data_type::UINT16> { using type = std::uint16_t; };
template <>
struct dictionary_key_type<data_type::INT16> { using type = std::int16_t; };
template <>
struct dictionary_key_type<data_type::UINT32> { using type = std::uint32_t; };
template <>
struct dictionary_key_type<data_type::INT32> { using type = std::int32_t; };
template <>
struct dictionary_key_type<data_type::UINT64> { using type = std::uint64_t; };
template <>
struct dictionary_key_type<data_type::INT64> { using type = std::int64_t; };
// Timestamp type specializations (with/without timezone)
template <> struct timestamp_type_map<data_type::TIMESTAMP_SECONDS> {
using with_tz = timestamp_seconds_array;
};
template <> struct timestamp_type_map<data_type::TIMESTAMP_MILLISECONDS> {
};
template <> struct timestamp_type_map<data_type::TIMESTAMP_MICROSECONDS> {
};
template <> struct timestamp_type_map<data_type::TIMESTAMP_NANOSECONDS> {
using with_tz = timestamp_nanoseconds_array;
};
// clang-format on
// Singleton registry of array factories.
//
// It stores one factory per base data_type plus a list of extension factories per base
// data_type, creates array wrappers from arrow_proxy objects, and dispatches visitors to
// the concrete array type held by an array_wrapper.
class array_registry
{
public:

    // Factory function type that creates an array_wrapper from an arrow_proxy.
    using factory_func = std::function<cloning_ptr<array_wrapper>(arrow_proxy)>;

    // Extension predicate that checks if a proxy matches an extension type.
    using extension_predicate = std::function<bool(const arrow_proxy&)>;

    // Visitor result type alias: the type F returns when invoked with a null_array.
    // NOTE(review): the dispatch functions use this as the return type for every array
    // type, so the visitor presumably has to return a compatible type for all of them.
    template <class F>
    using visit_result_t = std::invoke_result_t<F, null_array>;

    // Get the singleton registry instance.
    [[nodiscard]] SPARROW_API static array_registry& instance();

    // The registry is neither copyable nor movable; access it through instance().
    array_registry(const array_registry&) = delete;
    array_registry& operator=(const array_registry&) = delete;
    array_registry(array_registry&&) = delete;
    array_registry& operator=(array_registry&&) = delete;

    // Register a base type factory for the data type `dt`.
    SPARROW_API void register_base_type(data_type dt, factory_func factory);

    // Register an extension type factory for `base_type`, selected by extension name.
    // NOTE(review): presumably matched against the "ARROW:extension:name" metadata key
    // (see has_extension_name) - the implementation is not visible in this header.
    SPARROW_API void
    register_extension(data_type base_type, std::string_view extension_name, factory_func factory);

    // Register an extension type factory for `base_type`, selected by a custom predicate
    // evaluated on the arrow_proxy.
    SPARROW_API void
    register_extension(data_type base_type, extension_predicate predicate, factory_func factory);

    // Create an array wrapper from an arrow_proxy.
    [[nodiscard]] SPARROW_API cloning_ptr<array_wrapper> create(arrow_proxy proxy) const;

    // Invoke `func` with the concrete array held by `ar` and return its result.
    template <class F>
    [[nodiscard]] visit_result_t<F> dispatch(F&& func, const array_wrapper& ar) const;

private:

    // Only instance() is meant to construct the registry.
    array_registry();

    // Helper for dispatching with compile-time type knowledge.
    // Unwraps `ar` to the array type associated with DT and invokes `func` on it.
    template <class F, data_type DT>
    static auto dispatch_for_type(F&& func, const array_wrapper& ar) -> visit_result_t<F>
    {
        if constexpr (DT == data_type::TIMESTAMP_SECONDS || DT == data_type::TIMESTAMP_MILLISECONDS
                      || DT == data_type::TIMESTAMP_MICROSECONDS || DT == data_type::TIMESTAMP_NANOSECONDS)
        {
            // Special handling for timestamp types with timezone check:
            // a null timezone selects the array type without timezone.
            using types = timestamp_type_map<DT>;
            if (get_timezone(ar.get_arrow_proxy()) == nullptr)
            {
                return std::invoke(std::forward<F>(func), unwrap_array<typename types::without_tz>(ar));
            }
            else
            {
                return std::invoke(std::forward<F>(func), unwrap_array<typename types::with_tz>(ar));
            }
        }
        else
        {
            return std::invoke(std::forward<F>(func), unwrap_array<array_type_t<DT>>(ar));
        }
    }

    // Gives every data_type a function with the same signature, so that the functions
    // can be stored as plain function pointers in the dispatch table.
    template <class F>
    struct invoker
    {
        template <data_type DT>
        static auto run(F&& func, const array_wrapper& ar) -> visit_result_t<F>
        {
            return dispatch_for_type<F, DT>(std::forward<F>(func), ar);
        }
    };

    // Builds, at compile time, an array of function pointers with one entry per element
    // of all_data_types, in the same order as all_data_types.
    template <class F>
    static consteval auto make_dispatch_table()
    {
        using result_t = visit_result_t<F>;
        using invoker_t = result_t (*)(F&&, const array_wrapper&);
        return []<std::size_t... I>(std::index_sequence<I...>)
        {
            return std::array<invoker_t, all_data_types.size()>{
                &invoker<F>::template run<all_data_types[I]>...
            };
        }(std::make_index_sequence<all_data_types.size()>{});
    }

    // Helper method for dispatching base types
    template <class F>
    [[nodiscard]] visit_result_t<F>
    dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const;

    // Pairs the predicate that recognizes an extension with the factory that builds it.
    struct extension_entry
    {
        extension_entry(extension_predicate pred, factory_func fact)
            : predicate(std::move(pred))
            , factory(std::move(fact))
        {
        }

        // Declared in the same order as the constructor's initializer list.
        extension_predicate predicate;
        factory_func factory;

        // Helper to check if this extension matches a wrapper
        [[nodiscard]] bool matches(const array_wrapper& wrapper) const;
    };

    // Base type factories indexed by data_type
    std::unordered_map<data_type, factory_func> m_base_factories;

    // Extensions indexed by base data_type
    std::unordered_map<data_type, std::vector<extension_entry>> m_extensions;

    // Checks `proxy` against `extension_name`.
    // NOTE(review): presumably compares the "ARROW:extension:name" metadata value -
    // the definition is not visible in this header.
    [[nodiscard]] static bool has_extension_name(const arrow_proxy& proxy, std::string_view extension_name);
};
} // namespace sparrow
namespace sparrow
{
// ========== Template implementations ==========
// Invokes `func` with the concrete array held by `ar` and returns its result.
//
// Dictionary-encoded arrays are handled first: the wrapper's data type is then the
// integer key type, which selects the dictionary_encoded_array instantiation to unwrap.
// All other arrays go through dispatch_base_type.
//
// Throws std::runtime_error if a dictionary-encoded array reports a non-integer data type.
template <class F>
inline auto array_registry::dispatch(F&& func, const array_wrapper& ar) const -> visit_result_t<F>
{
    const auto dt = ar.data_type();

    // Handle dictionary encoding first
    if (ar.is_dictionary())
    {
        switch (dt)
        {
            case data_type::UINT8:
                return func(unwrap_array<dictionary_encoded_array<std::uint8_t>>(ar));
            case data_type::INT8:
                return func(unwrap_array<dictionary_encoded_array<std::int8_t>>(ar));
            case data_type::UINT16:
                return func(unwrap_array<dictionary_encoded_array<std::uint16_t>>(ar));
            case data_type::INT16:
                return func(unwrap_array<dictionary_encoded_array<std::int16_t>>(ar));
            case data_type::UINT32:
                return func(unwrap_array<dictionary_encoded_array<std::uint32_t>>(ar));
            case data_type::INT32:
                return func(unwrap_array<dictionary_encoded_array<std::int32_t>>(ar));
            case data_type::UINT64:
                return func(unwrap_array<dictionary_encoded_array<std::uint64_t>>(ar));
            case data_type::INT64:
                return func(unwrap_array<dictionary_encoded_array<std::int64_t>>(ar));
            default:
                throw std::runtime_error("data type of dictionary encoded array must be an integer");
        }
    }

    return dispatch_base_type(std::forward<F>(func), ar, dt);
}
// Looks up the handler for `dt` in the compile-time dispatch table and invokes it with
// `func` and `ar`.
//
// The table holds one entry per element of all_data_types and is indexed with the
// integer value of `dt`.
//
// Throws std::runtime_error if that value lies outside the table, instead of reading
// past its end.
template <class F>
inline auto array_registry::dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const
    -> visit_result_t<F>
{
    static constexpr auto table = make_dispatch_table<F>();
    const auto index = static_cast<std::size_t>(dt);
    if (index >= table.size())
    {
        throw std::runtime_error("unsupported data type for array dispatch");
    }
    return table[index](std::forward<F>(func), ar);
}
// Free-function entry point kept for backward compatibility: applies `func` to the
// concrete array behind `ar` by delegating to dispatch() on the registry singleton.
template <class F>
[[nodiscard]] inline auto visit(F&& func, const array_wrapper& ar) -> std::invoke_result_t<F, null_array>
{
    const array_registry& registry = array_registry::instance();
    return registry.dispatch(std::forward<F>(func), ar);
}
} // namespace sparrow
void sparse_union_array()
void struct_array()
SPARROW_API void register_extension(data_type base_type, std::string_view extension_name, factory_func factory)
Register an extension type factory.
static SPARROW_API array_registry & instance()
Get the singleton registry instance.
array_registry & operator=(const array_registry &)=delete
std::function< cloning_ptr< array_wrapper >(arrow_proxy)> factory_func
Factory function type that creates an array_wrapper from an arrow_proxy.
std::invoke_result_t< F, null_array > visit_result_t
Visitor result type alias.
SPARROW_API void register_base_type(data_type dt, factory_func factory)
Register a base type factory.
std::function< bool(const arrow_proxy &)> extension_predicate
Extension predicate that checks if a proxy matches an extension type.
SPARROW_API cloning_ptr< array_wrapper > create(arrow_proxy proxy) const
Create an array wrapper from an arrow_proxy.
visit_result_t< F > dispatch(F &&func, const array_wrapper &ar) const
Base class for array type erasure.
#define SPARROW_API
Definition config.hpp:38
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
time_array< chrono::time_microseconds > time_microseconds_array
A time array for std::chrono::time_microseconds values.
constexpr std::array all_data_types
time_array< chrono::time_milliseconds > time_milliseconds_array
A time array for std::chrono::time_milliseconds values.
decimal_array< decimal< int128_t > > decimal_128_array
Type alias for 128-bit decimal array.
duration_array< std::chrono::seconds > duration_seconds_array
A duration array for std::chrono::seconds values.
list_array_impl< false > list_array
A list array implementation.
decimal_array< decimal< int32_t > > decimal_32_array
Type alias for 32-bit decimal array.
decimal_array< decimal< int64_t > > decimal_64_array
Type alias for 64-bit decimal array.
time_array< chrono::time_nanoseconds > time_nanoseconds_array
A time array for std::chrono::time_nanoseconds values.
typename dictionary_key_type< DT >::type dictionary_key_t
timestamp_without_timezone_array< zoned_time_without_timezone_seconds > timestamp_without_timezone_seconds_array
A timestamp without timezone array for zoned_time_without_timezone_seconds values.
date_array< date_milliseconds > date_milliseconds_array
A date array for date_milliseconds values.
decimal_array< decimal< int256_t > > decimal_256_array
Type alias for 256-bit decimal array.
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
list_view_array_impl< true > big_list_view_array
typename array_type_map< DT >::type array_type_t
date_array< date_days > date_days_array
A date array for date_days values.
auto visit(F &&func, const array_wrapper &ar) -> std::invoke_result_t< F, null_array >
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
SPARROW_API const date::time_zone * get_timezone(const arrow_proxy &proxy)
interval_array< month_day_nanoseconds_interval > month_day_nanoseconds_interval_array
An interval array for month_day_nanoseconds_interval values.
time_array< chrono::time_seconds > time_seconds_array
A time array for std::chrono::time_seconds values.
timestamp_array< timestamp_second > timestamp_seconds_array
Type aliases for timestamp arrays with common durations.
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_view_array_impl< false > list_view_array
A list view array implementation.
T & unwrap_array(array_wrapper &)
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
timestamp_array< timestamp_nanosecond > timestamp_nanoseconds_array
primitive_array_impl< T, Ext, T2 > primitive_array
Array of values of whose type has fixed binary size.
list_array_impl< true > big_list_array
A big list array implementation.
interval_array< days_time_interval > days_time_interval_array
An interval array for days_time_interval values.
timestamp_without_timezone_array< zoned_time_without_timezone_nanoseconds > timestamp_without_timezone_nanoseconds_array
A timestamp without timezone array for zoned_time_without_timezone_nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_milliseconds > timestamp_without_timezone_milliseconds_array
A timestamp without timezone array for zoned_time_without_timezone_milliseconds values.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
interval_array< chrono::months > months_interval_array
An interval array for std::chrono::months values.
duration_array< std::chrono::microseconds > duration_microseconds_array
A duration array for std::chrono::microseconds values.
duration_array< std::chrono::milliseconds > duration_milliseconds_array
A duration array for std::chrono::milliseconds values.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
duration_array< std::chrono::nanoseconds > duration_nanoseconds_array
A duration array for std::chrono::nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_microseconds > timestamp_without_timezone_microseconds_array
A timestamp without timezone array for zoned_time_without_timezone_microseconds values.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
timestamp_array< timestamp_microsecond > timestamp_microseconds_array
timestamp_array< timestamp_millisecond > timestamp_milliseconds_array