Registry for array factories supporting base types and extensions.
This registry provides a centralized mechanism for creating array instances from arrow_proxy objects. It supports both base types and extension types.
Extension types are identified by the "ARROW:extension:name" metadata key.
// Register a custom extension
auto& registry = array_registry::instance();
registry.register_extension(
    data_type::BINARY,
    "my.custom.type",
    [](arrow_proxy proxy)
    {
        return cloning_ptr<array_wrapper>{
            new array_wrapper_impl<my_custom_array>(my_custom_array(std::move(proxy)))
        };
    }
);
// Use the factory
auto arr_wrapper = array_factory(some_proxy);  // Automatically dispatches to the right type
// Use dispatch to visit the array with type safety
auto result = registry.dispatch([](auto&& array) { return array.size(); }, *arr_wrapper);
#pragma once
#include <array>
#include <functional>
#include <limits>
#include <stdexcept>
#include <string_view>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
// NOTE(review): the declaration that opens this scope is not present in this
// listing; confirm against the original header.
{
// Primary template, declared but not defined here. The explicit
// specializations that follow each expose a nested `type` naming the array
// class associated with one data_type value.
template <data_type DT>
struct array_type_map;
// NOTE(review): this template header has no declaration after it in this
// listing - presumably the `array_type_t` alias; confirm.
template <data_type DT>
// Primary template, declared but not defined here. The explicit
// specializations that follow each expose a nested `type` naming the integer
// type associated with one integral data_type value.
template <data_type DT>
struct dictionary_key_type;
// NOTE(review): this template header has no declaration after it in this
// listing - presumably the `dictionary_key_t` alias; confirm.
template <data_type DT>
// Primary template, declared but not defined here; it is referenced as
// timestamp_type_map<DT> inside array_registry::dispatch_for_type.
template <data_type DT>
struct timestamp_type_map;
// NOTE(review): closer of a definition whose body is missing from this listing.
};
/// Associates data_type::NA with the null_array class.
template <>
struct array_type_map<data_type::NA>
{
    using type = null_array;
};
/// Associates data_type::LIST with the list_array class.
template <>
struct array_type_map<data_type::LIST>
{
    using type = list_array;
};
/// Associates data_type::FIXED_SIZED_LIST with the fixed_sized_list_array class.
template <>
struct array_type_map<data_type::FIXED_SIZED_LIST>
{
    using type = fixed_sized_list_array;
};
/// Associates data_type::MAP with the map_array class.
template <>
struct array_type_map<data_type::MAP>
{
    using type = map_array;
};
/// Associates data_type::RUN_ENCODED with the run_end_encoded_array class.
template <>
struct array_type_map<data_type::RUN_ENCODED>
{
    using type = run_end_encoded_array;
};
/// Associates data_type::DENSE_UNION with the dense_union_array class.
template <>
struct array_type_map<data_type::DENSE_UNION>
{
    using type = dense_union_array;
};
/// Key type selected for data_type::UINT8: std::uint8_t.
template <>
struct dictionary_key_type<data_type::UINT8>
{
    using type = std::uint8_t;
};
/// Key type selected for data_type::INT8: std::int8_t.
template <>
struct dictionary_key_type<data_type::INT8>
{
    using type = std::int8_t;
};
/// Key type selected for data_type::UINT16: std::uint16_t.
template <>
struct dictionary_key_type<data_type::UINT16>
{
    using type = std::uint16_t;
};
/// Key type selected for data_type::INT16: std::int16_t.
template <>
struct dictionary_key_type<data_type::INT16>
{
    using type = std::int16_t;
};
/// Key type selected for data_type::UINT32: std::uint32_t.
template <>
struct dictionary_key_type<data_type::UINT32>
{
    using type = std::uint32_t;
};
/// Key type selected for data_type::INT32: std::int32_t.
template <>
struct dictionary_key_type<data_type::INT32>
{
    using type = std::int32_t;
};
/// Key type selected for data_type::UINT64: std::uint64_t.
template <>
struct dictionary_key_type<data_type::UINT64>
{
    using type = std::uint64_t;
};
/// Key type selected for data_type::INT64: std::int64_t.
template <>
struct dictionary_key_type<data_type::INT64>
{
    using type = std::int64_t;
};
};
};
};
};
// Registry that creates array_wrapper objects from arrow_proxy objects.
// It stores one factory per data_type (m_base_factories) and, per data_type,
// a list of extension entries (m_extensions).
//
// NOTE(review): several member declarations, signatures and branch conditions
// are missing from this listing; the notes below mark each gap.
class array_registry
{
public:
// Callable that receives an arrow_proxy by value and returns an owning
// cloning_ptr<array_wrapper>.
using factory_func = std::function<cloning_ptr<array_wrapper>(arrow_proxy)>;
// NOTE(review): this template header has no declaration after it in this
// listing - presumably the visit_result_t alias used below; confirm.
template <class F>
// The registry can be neither copied nor moved.
array_registry(const array_registry&) = delete;
array_registry&
operator=(
const array_registry&) =
delete;
array_registry(array_registry&&) = delete;
array_registry&
operator=(array_registry&&) =
delete;
// Builds an array wrapper from the given proxy. The proxy is taken by value
// and the result must not be discarded.
[[nodiscard]]
SPARROW_API cloning_ptr<array_wrapper>
create(arrow_proxy proxy)
const;
// NOTE(review): this template header has no declaration after it in this
// listing - presumably the public dispatch(func, ar) member; confirm.
template <class F>
private:
// Construction is private; client code cannot create a registry directly.
array_registry();
// Per-data-type dispatch step, selected at compile time through DT.
// NOTE(review): the branch conditions and the branch bodies are missing from
// this listing; only the `timestamp_type_map<DT>` alias survives, so one
// branch presumably handles timestamp types - confirm.
template <class F, data_type DT>
static auto dispatch_for_type(F&& func,
const array_wrapper& ar) ->
visit_result_t<F>
{
{
using types = timestamp_type_map<DT>;
{
}
else
{
}
}
else
{
}
}
// Adapter whose member template forwards to dispatch_for_type<F, DT>.
// NOTE(review): the member's declarator is missing from this listing;
// make_dispatch_table refers to it as `run`.
template <class F>
struct invoker
{
template <data_type DT>
{
return dispatch_for_type<F, DT>(std::forward<F>(func), ar);
}
};
// Compile-time builder of the dispatch table: expands an index sequence and
// takes the address of invoker<F>::run<all_data_types[I]> for every I.
// NOTE(review): `result_t`, the container holding the pointers and the call
// that supplies the index sequence are missing from this listing.
template <class F>
static consteval auto make_dispatch_table()
{
using invoker_t = result_t (*)(F&&, const array_wrapper&);
return []<std::size_t... I>(std::index_sequence<I...>)
{
&invoker<F>::template run<all_data_types[I]>...
};
}
// Dispatch on an explicit data_type value.
// NOTE(review): the return type is missing from this declaration in the listing.
template <class F>
dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const;
// One registered extension: a predicate together with the factory to use.
// NOTE(review): the data members and the constructor's parameter list are
// missing from this listing; the initializer list shows that `predicate` and
// `factory` are move-initialized from `pred` and `fact`.
struct extension_entry
{
: predicate(std::move(pred))
, factory(std::move(fact))
{
}
// Declared here only; presumably evaluates the stored predicate for `wrapper` - confirm.
[[nodiscard]] bool matches(const array_wrapper& wrapper) const;
};
// Factories keyed by data_type.
std::unordered_map<data_type, factory_func> m_base_factories;
// Extension entries grouped by data_type.
std::unordered_map<data_type, std::vector<extension_entry>> m_extensions;
// Declared here only; presumably compares the proxy's
// "ARROW:extension:name" metadata with extension_name - confirm.
[[nodiscard]] static bool has_extension_name(const arrow_proxy& proxy, std::string_view extension_name);
};
}
{
// Dispatch entry point: a dictionary-encoded wrapper is unwrapped as
// dictionary_encoded_array<Key> and handed to func; any other wrapper is
// routed through dispatch_base_type with the wrapper's own data type.
// NOTE(review): the function's declarator (name, parameters, return type) is
// missing from this listing; confirm against the original header.
template <class F>
{
if (ar.is_dictionary())
{
switch (ar.data_type())
{
// NOTE(review): the `case` label that precedes each of the eight returns
// below is missing from this listing. As written, statements before the
// first label of a switch are never executed, so control would always
// reach `default`. Presumably one label per integral data_type, in the
// order uint8, int8, uint16, int16, uint32, int32, uint64, int64 - confirm.
return func(
unwrap_array<dictionary_encoded_array<std::uint8_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::int8_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::uint16_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::int16_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::uint32_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::int32_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::uint64_t>>(ar));
return func(
unwrap_array<dictionary_encoded_array<std::int64_t>>(ar));
default:
// Any other data type is rejected with std::runtime_error.
throw std::runtime_error("data type of dictionary encoded array must be an integer");
}
}
// Not dictionary-encoded: dispatch on the wrapper's data type.
const auto dt = ar.data_type();
return dispatch_base_type(std::forward<F>(func), ar, dt);
}
// Table-driven dispatch: selects an entry by the numeric value of dt and
// calls it with the forwarded functor and the wrapper.
// NOTE(review): the declarator line is missing from this listing; only the
// trailing return type survives.
template <class F>
-> visit_result_t<F>
{
// Built at compile time by make_dispatch_table<F>().
static constexpr auto table = make_dispatch_table<F>();
// NOTE(review): no bounds check is visible; this assumes every data_type
// value, cast to std::size_t, is a valid index into the table - confirm.
return table[static_cast<std::size_t>(dt)](std::forward<F>(func), ar);
}
// Free-function visitor over a type-erased array. The declared result type is
// whatever F returns when invoked with a null_array, and the result must not
// be discarded.
// NOTE(review): the body is empty in this listing although the return type is
// not void - presumably it forwards to the registry's dispatch; confirm
// against the original header.
template <class F>
[[nodiscard]]
inline auto visit(F&& func,
const array_wrapper& ar) -> std::invoke_result_t<F, null_array>
{
}
}
void sparse_union_array()
SPARROW_API void register_extension(data_type base_type, std::string_view extension_name, factory_func factory)
Register an extension type factory.
static SPARROW_API array_registry & instance()
Get the singleton registry instance.
array_registry & operator=(const array_registry &)=delete
std::function< cloning_ptr< array_wrapper >(arrow_proxy)> factory_func
Factory function type that creates an array_wrapper from an arrow_proxy.
std::invoke_result_t< F, null_array > visit_result_t
Visitor result type alias.
SPARROW_API void register_base_type(data_type dt, factory_func factory)
Register a base type factory.
std::function< bool(const arrow_proxy &)> extension_predicate
Extension predicate that checks if a proxy matches an extension type.
SPARROW_API cloning_ptr< array_wrapper > create(arrow_proxy proxy) const
Create an array wrapper from an arrow_proxy.
visit_result_t< F > dispatch(F &&func, const array_wrapper &ar) const
Base class for array type erasure.
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
time_array< chrono::time_microseconds > time_microseconds_array
A time array for std::chrono::time_microseconds values.
constexpr std::array all_data_types
time_array< chrono::time_milliseconds > time_milliseconds_array
A time array for std::chrono::time_milliseconds values.
decimal_array< decimal< int128_t > > decimal_128_array
Type alias for 128-bit decimal array.
duration_array< std::chrono::seconds > duration_seconds_array
A duration array for std::chrono::seconds values.
list_array_impl< false > list_array
A list array implementation.
decimal_array< decimal< int32_t > > decimal_32_array
Type alias for 32-bit decimal array.
decimal_array< decimal< int64_t > > decimal_64_array
Type alias for 64-bit decimal array.
time_array< chrono::time_nanoseconds > time_nanoseconds_array
A time array for std::chrono::time_nanoseconds values.
typename dictionary_key_type< DT >::type dictionary_key_t
timestamp_without_timezone_array< zoned_time_without_timezone_seconds > timestamp_without_timezone_seconds_array
A timestamp without timezone array for zoned_time_without_timezone_seconds values.
date_array< date_milliseconds > date_milliseconds_array
A date array for date_milliseconds values.
decimal_array< decimal< int256_t > > decimal_256_array
Type alias for 256-bit decimal array.
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
list_view_array_impl< true > big_list_view_array
typename array_type_map< DT >::type array_type_t
date_array< date_days > date_days_array
A date array for date_days values.
auto visit(F &&func, const array_wrapper &ar) -> std::invoke_result_t< F, null_array >
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
SPARROW_API const date::time_zone * get_timezone(const arrow_proxy &proxy)
interval_array< month_day_nanoseconds_interval > month_day_nanoseconds_interval_array
An interval array for month_day_nanoseconds_interval values.
time_array< chrono::time_seconds > time_seconds_array
A time array for std::chrono::time_seconds values.
timestamp_array< timestamp_second > timestamp_seconds_array
Type aliases for timestamp arrays with common durations.
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_view_array_impl< false > list_view_array
A list view array implementation.
T & unwrap_array(array_wrapper &)
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
timestamp_array< timestamp_nanosecond > timestamp_nanoseconds_array
primitive_array_impl< T, Ext, T2 > primitive_array
Array of values whose type has a fixed binary size.
list_array_impl< true > big_list_array
A big list array implementation.
interval_array< days_time_interval > days_time_interval_array
An interval array for days_time_interval values.
timestamp_without_timezone_array< zoned_time_without_timezone_nanoseconds > timestamp_without_timezone_nanoseconds_array
A timestamp without timezone array for zoned_time_without_timezone_nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_milliseconds > timestamp_without_timezone_milliseconds_array
A timestamp without timezone array for zoned_time_without_timezone_milliseconds values.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
interval_array< chrono::months > months_interval_array
An interval array for std::chrono::months values.
duration_array< std::chrono::microseconds > duration_microseconds_array
A duration array for std::chrono::microseconds values.
duration_array< std::chrono::milliseconds > duration_milliseconds_array
A duration array for std::chrono::milliseconds values.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
duration_array< std::chrono::nanoseconds > duration_nanoseconds_array
A duration array for std::chrono::nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_microseconds > timestamp_without_timezone_microseconds_array
A timestamp without timezone array for zoned_time_without_timezone_microseconds values.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
@ INTERVAL_MONTHS_DAYS_NANOSECONDS
timestamp_array< timestamp_microsecond > timestamp_microseconds_array
timestamp_array< timestamp_millisecond > timestamp_milliseconds_array