sparrow 1.3.0
Loading...
Searching...
No Matches
array_registry.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <array>
18#include <functional>
19#include <limits>
20#include <stdexcept>
21#include <string_view>
22#include <type_traits>
23#include <unordered_map>
24#include <utility>
25#include <vector>
26
31
32// Array type includes for dispatch support
40#include "sparrow/map_array.hpp"
51
52namespace sparrow
53{
54
55 // Primary template - undefined for unsupported types
56 // Specializations are defined later in this file, after all array types are included
57 template <data_type DT>
59
60 // Helper alias
61 template <data_type DT>
63
64 // Dictionary encoding type map (for integer types)
65 template <data_type DT>
67
68 template <data_type DT>
70
71 // Timestamp types with/without timezone
72 template <data_type DT>
74
75 // List of all supported data types - single source of truth
129
130 // clang-format off
131 // Template specializations for array_type_map - defined here after all array includes.
    // Each specialization binds one data_type enumerator to the concrete array class
    // used for it, exposed as the nested alias `type` (read through array_type_t<DT>).
132 template <> struct array_type_map<data_type::NA> { using type = null_array; };
133 template <> struct array_type_map<data_type::BOOL> { using type = primitive_array<bool>; };
145 template <> struct array_type_map<data_type::STRING> { using type = string_array; };
146 template <> struct array_type_map<data_type::STRING_VIEW> { using type = string_view_array; };
147 template <> struct array_type_map<data_type::LARGE_STRING> { using type = big_string_array; };
148 template <> struct array_type_map<data_type::BINARY> { using type = binary_array; };
149 template <> struct array_type_map<data_type::BINARY_VIEW> { using type = binary_view_array; };
150 template <> struct array_type_map<data_type::LARGE_BINARY> { using type = big_binary_array; };
151 template <> struct array_type_map<data_type::LIST> { using type = list_array; };
152 template <> struct array_type_map<data_type::LARGE_LIST> { using type = big_list_array; };
153 template <> struct array_type_map<data_type::LIST_VIEW> { using type = list_view_array; };
156 template <> struct array_type_map<data_type::STRUCT> { using type = struct_array; };
157 template <> struct array_type_map<data_type::MAP> { using type = map_array; };
159 template <> struct array_type_map<data_type::DENSE_UNION> { using type = dense_union_array; };
161 template <> struct array_type_map<data_type::DECIMAL32> { using type = decimal_32_array; };
162 template <> struct array_type_map<data_type::DECIMAL64> { using type = decimal_64_array; };
163 template <> struct array_type_map<data_type::DECIMAL128> { using type = decimal_128_array; };
164 template <> struct array_type_map<data_type::DECIMAL256> { using type = decimal_256_array; };
166 template <> struct array_type_map<data_type::DATE_DAYS> { using type = date_days_array; };
183
184 // Dictionary key type specializations.
    // Each integer data_type enumerator is mapped to the fixed-width standard integer
    // type of the same signedness and width, exposed as the nested alias `type`
    // (read through dictionary_key_t<DT>).
185 template <> struct dictionary_key_type<data_type::UINT8> { using type = std::uint8_t; };
186 template <> struct dictionary_key_type<data_type::INT8> { using type = std::int8_t; };
187 template <> struct dictionary_key_type<data_type::UINT16> { using type = std::uint16_t; };
188 template <> struct dictionary_key_type<data_type::INT16> { using type = std::int16_t; };
189 template <> struct dictionary_key_type<data_type::UINT32> { using type = std::uint32_t; };
190 template <> struct dictionary_key_type<data_type::INT32> { using type = std::int32_t; };
191 template <> struct dictionary_key_type<data_type::UINT64> { using type = std::uint64_t; };
192 template <> struct dictionary_key_type<data_type::INT64> { using type = std::int64_t; };
193
194 // Timestamp type specializations (with/without timezone)
211
212 // clang-format on
213
252 {
253 public:
254
// Factory function type that creates an array_wrapper from an arrow_proxy.
// The proxy is taken by value and the wrapper is returned through a cloning_ptr.
256 using factory_func = std::function<cloning_ptr<array_wrapper>(arrow_proxy)>;
257
// Extension predicate that checks if a proxy matches an extension type.
// The proxy is only observed (const reference); the result is a plain bool.
259 using extension_predicate = std::function<bool(const arrow_proxy&)>;
260
// Visitor result type alias: the type obtained when F is invoked with a null_array.
// This one type is used as the return type of every dispatch path in this class.
262 template <class F>
263 using visit_result_t = std::invoke_result_t<F, null_array>;
264
// Get the singleton registry instance. Returned by reference; the result must not be ignored.
268 [[nodiscard]] SPARROW_API static array_registry& instance();
273
292
// Register an extension type factory.
//   base_type      - base data_type the extension is registered under
//   extension_name - name identifying the extension
//   factory        - callable that builds the array_wrapper from an arrow_proxy
// NOTE(review): how extension_name is matched against a proxy is decided in the
// out-of-line implementation, which is not visible here - confirm before relying on it.
303 SPARROW_API void
304 register_extension(data_type base_type, std::string_view extension_name, factory_func factory);
305
316 SPARROW_API void
318
332
// Invokes `func` on the array held by `ar` and returns the result as visit_result_t<F>.
//   func - visitor, taken as a forwarding reference
//   ar   - type-erased array to visit
// The definition follows the class, in the template implementations section.
368 template <class F>
369 [[nodiscard]] visit_result_t<F> dispatch(F&& func, const array_wrapper& ar) const;
370
371 private:
372
374
375 // Helper for dispatching with compile-time type knowledge.
    // Unwraps `ar` to the concrete array type selected for DT and invokes `func` on it,
    // returning the result as visit_result_t<F>. `func` is forwarded exactly once,
    // because every branch returns immediately after its single std::invoke call.
376 template <class F, data_type DT>
377 static auto dispatch_for_type(F&& func, const array_wrapper& ar) -> visit_result_t<F>
378 {
381 {
382 // Special handling for timestamp types with timezone check
    // DT alone does not determine the array class in this branch: a null timezone
    // on the arrow proxy selects types::without_tz, any other value selects types::with_tz.
383 using types = timestamp_type_map<DT>;
384 if (get_timezone(ar.get_arrow_proxy()) == nullptr)
385 {
386 return std::invoke(std::forward<F>(func), unwrap_array<typename types::without_tz>(ar));
387 }
388 else
389 {
390 return std::invoke(std::forward<F>(func), unwrap_array<typename types::with_tz>(ar));
391 }
392 }
393 else
394 {
    // Every other data type maps to exactly one array class through array_type_t<DT>.
395 return std::invoke(std::forward<F>(func), unwrap_array<array_type_t<DT>>(ar));
396 }
397 }
398
// Adapter around dispatch_for_type<F, DT>.
// run<DT> has the same signature for every DT, (F&&, const array_wrapper&) -> visit_result_t<F>,
// so the addresses of its instantiations can be stored together in one function-pointer table
// (see make_dispatch_table).
399 template <class F>
400 struct invoker
401 {
    // Forwards `func` and `ar` unchanged to dispatch_for_type<F, DT>.
402 template <data_type DT>
403 static auto run(F&& func, const array_wrapper& ar) -> visit_result_t<F>
404 {
405 return dispatch_for_type<F, DT>(std::forward<F>(func), ar);
406 }
407 };
408
// Builds, at compile time (consteval), the dispatch table for visitor type F.
// Returns a std::array with all_data_types.size() function pointers; the entry at
// position I is &invoker<F>::run<all_data_types[I]>, so the table order follows the
// order of all_data_types.
409 template <class F>
410 static consteval auto make_dispatch_table()
411 {
412 using result_t = visit_result_t<F>;
    // Common pointer type shared by every invoker<F>::run<DT> instantiation.
413 using invoker_t = result_t (*)(F&&, const array_wrapper&);
414
    // Immediately-invoked templated lambda: expands the index pack I over all_data_types.
415 return []<std::size_t... I>(std::index_sequence<I...>)
416 {
417 return std::array<invoker_t, all_data_types.size()>{
418 &invoker<F>::template run<all_data_types[I]>...
419 };
420 }(std::make_index_sequence<all_data_types.size()>{});
421 }
422
423 // Helper method for dispatching base types.
    // Invokes `func` on `ar` using the handler selected by the run-time value `dt`
    // and returns the result as visit_result_t<F>.
    //   func - visitor, taken as a forwarding reference
    //   ar   - type-erased array to visit
    //   dt   - data type used to select the handler
424 template <class F>
425 [[nodiscard]] visit_result_t<F>
426 dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const;
427
// One registered extension: a predicate paired with the factory to use for it.
428 struct extension_entry
429 {
    // Both callables are taken by value and moved into the members.
430 extension_entry(extension_predicate pred, factory_func fact)
431 : predicate(std::move(pred))
432 , factory(std::move(fact))
433 {
434 }
435
    // Predicate over an arrow_proxy; returns bool.
436 extension_predicate predicate;
    // Callable that builds the array_wrapper from an arrow_proxy.
437 factory_func factory;
438
439 // Helper to check if this extension matches a wrapper
    // NOTE(review): defined out of line; presumably evaluates `predicate` on the
    // wrapper's arrow proxy - confirm against the implementation.
440 [[nodiscard]] bool matches(const array_wrapper& wrapper) const;
441 };
442
443 // Base type factories indexed by data_type (one factory per data_type key)
444 std::unordered_map<data_type, factory_func> m_base_factories;
445
446 // Extensions indexed by base data_type; each key holds a vector of extension entries,
    // so one base type can carry several extensions
447 std::unordered_map<data_type, std::vector<extension_entry>> m_extensions;
448
    // NOTE(review): defined out of line; presumably reports whether `proxy` carries
    // the extension called `extension_name` - confirm against the implementation.
452 [[nodiscard]] static bool has_extension_name(const arrow_proxy& proxy, std::string_view extension_name);
453 };
454
455
456} // namespace sparrow
457
458namespace sparrow
459{
460 // ========== Template implementations ==========
461
// Entry point of the visitor dispatch: invokes `func` on the array held by `ar`.
//   func - visitor, taken as a forwarding reference
//   ar   - type-erased array to visit
// Returns the visitor result as visit_result_t<F>.
// Throws std::runtime_error when `ar` is dictionary-encoded and its data type has
// no matching case in the switch below.
462 template <class F>
463 inline auto array_registry::dispatch(F&& func, const array_wrapper& ar) const -> visit_result_t<F>
464 {
465 // Handle dictionary encoding first
    // For a dictionary-encoded array the switch selects on ar.data_type(), which
    // is expected to be an integer type here (see the error message below).
466 if (ar.is_dictionary())
467 {
468 switch (ar.data_type())
469 {
470 case data_type::UINT8:
472 case data_type::INT8:
476 case data_type::INT16:
480 case data_type::INT32:
484 case data_type::INT64:
486 default:
487 throw std::runtime_error("data type of dictionary encoded array must be an integer");
488 }
489 }
490
    // Arrays that are not dictionary-encoded are routed by their own data type.
491 const auto dt = ar.data_type();
492 return dispatch_base_type(std::forward<F>(func), ar, dt);
493 }
494
// Looks up the handler for `dt` in the compile-time dispatch table and calls it
// with `func` and `ar`.
495 template <class F>
496 inline auto array_registry::dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const
498 {
    // One table per visitor type F, built at compile time by make_dispatch_table<F>().
499 static constexpr auto table = make_dispatch_table<F>();
    // NOTE(review): the table is filled in all_data_types order but indexed here by the
    // numeric value of `dt`, with no bounds check. This is only correct if
    // all_data_types[i] has underlying value i for every i; a `dt` outside that range
    // reads past the array. Confirm the invariant is enforced where all_data_types is defined.
500 return table[static_cast<std::size_t>(dt)](std::forward<F>(func), ar);
501 }
502
503 // Standalone visit function for backward compatibility.
    // Forwards `func` and `ar` to array_registry::instance().dispatch and returns its result.
    // The return type is std::invoke_result_t<F, null_array>, the same type that
    // array_registry names visit_result_t<F>.
504 template <class F>
505 [[nodiscard]] inline auto visit(F&& func, const array_wrapper& ar) -> std::invoke_result_t<F, null_array>
506 {
507 return array_registry::instance().dispatch(std::forward<F>(func), ar);
508 }
509
510} // namespace sparrow
SPARROW_API void register_extension(data_type base_type, extension_predicate predicate, factory_func factory)
Register an extension type with custom predicate.
SPARROW_API void register_extension(data_type base_type, std::string_view extension_name, factory_func factory)
Register an extension type factory.
static SPARROW_API array_registry & instance()
Get the singleton registry instance.
array_registry & operator=(const array_registry &)=delete
std::function< cloning_ptr< array_wrapper >(arrow_proxy)> factory_func
Factory function type that creates an array_wrapper from an arrow_proxy.
std::invoke_result_t< F, null_array > visit_result_t
Visitor result type alias.
SPARROW_API void register_base_type(data_type dt, factory_func factory)
Register a base type factory.
std::function< bool(const arrow_proxy &)> extension_predicate
Extension predicate that checks if a proxy matches an extension type.
SPARROW_API cloning_ptr< array_wrapper > create(arrow_proxy proxy) const
Create an array wrapper from an arrow_proxy.
array_registry(array_registry &&)=delete
array_registry & operator=(array_registry &&)=delete
visit_result_t< F > dispatch(F &&func, const array_wrapper &ar) const
array_registry(const array_registry &)=delete
Base class for array type erasure.
Smart pointer behaving like a copiable std::unique_ptr.
Definition memory.hpp:126
Dense union array implementation with offset buffer.
Forward declaration of dictionary_encoded_array.
Memory-efficient array implementation for null data types.
Sparse union array implementation without offset buffer.
#define SPARROW_API
Definition config.hpp:38
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
time_array< chrono::time_microseconds > time_microseconds_array
A time array for std::chrono::time_microseconds values.
constexpr std::array all_data_types
time_array< chrono::time_milliseconds > time_milliseconds_array
A time array for std::chrono::time_milliseconds values.
decimal_array< decimal< int128_t > > decimal_128_array
Type alias for 128-bit decimal array.
duration_array< std::chrono::seconds > duration_seconds_array
A duration array for std::chrono::seconds values.
list_array_impl< false > list_array
A list array implementation.
decimal_array< decimal< int32_t > > decimal_32_array
Type alias for 32-bit decimal array.
decimal_array< decimal< int64_t > > decimal_64_array
Type alias for 64-bit decimal array.
time_array< chrono::time_nanoseconds > time_nanoseconds_array
A time array for std::chrono::time_nanoseconds values.
typename dictionary_key_type< DT >::type dictionary_key_t
timestamp_without_timezone_array< zoned_time_without_timezone_seconds > timestamp_without_timezone_seconds_array
A timestamp without timezone array for zoned_time_without_timezone_seconds values.
date_array< date_milliseconds > date_milliseconds_array
A date array for date_milliseconds values.
decimal_array< decimal< int256_t > > decimal_256_array
Type alias for 256-bit decimal array.
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
list_view_array_impl< true > big_list_view_array
typename array_type_map< DT >::type array_type_t
date_array< date_days > date_days_array
A date array for date_days values.
auto visit(F &&func, const array_wrapper &ar) -> std::invoke_result_t< F, null_array >
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
SPARROW_API const date::time_zone * get_timezone(const arrow_proxy &proxy)
interval_array< month_day_nanoseconds_interval > month_day_nanoseconds_interval_array
An interval array for month_day_nanoseconds_interval values.
time_array< chrono::time_seconds > time_seconds_array
A time array for std::chrono::time_seconds values.
timestamp_array< timestamp_second > timestamp_seconds_array
Type aliases for timestamp arrays with common durations.
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_view_array_impl< false > list_view_array
A list view array implementation.
T & unwrap_array(array_wrapper &)
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
timestamp_array< timestamp_nanosecond > timestamp_nanoseconds_array
primitive_array_impl< T, Ext, T2 > primitive_array
Array of values whose type has a fixed binary size.
list_array_impl< true > big_list_array
A big list array implementation.
interval_array< days_time_interval > days_time_interval_array
An interval array for days_time_interval values.
timestamp_without_timezone_array< zoned_time_without_timezone_nanoseconds > timestamp_without_timezone_nanoseconds_array
A timestamp without timezone array for zoned_time_without_timezone_nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_milliseconds > timestamp_without_timezone_milliseconds_array
A timestamp without timezone array for zoned_time_without_timezone_milliseconds values.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
interval_array< chrono::months > months_interval_array
An interval array for std::chrono::months values.
duration_array< std::chrono::microseconds > duration_microseconds_array
A duration array for std::chrono::microseconds values.
duration_array< std::chrono::milliseconds > duration_milliseconds_array
A duration array for std::chrono::milliseconds values.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
duration_array< std::chrono::nanoseconds > duration_nanoseconds_array
A duration array for std::chrono::nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_microseconds > timestamp_without_timezone_microseconds_array
A timestamp without timezone array for zoned_time_without_timezone_microseconds values.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
timestamp_array< timestamp_microsecond > timestamp_microseconds_array
timestamp_array< timestamp_millisecond > timestamp_milliseconds_array
primitive_array< std::uint16_t > type
primitive_array< std::uint32_t > type
primitive_array< std::uint64_t > type
timestamp_without_timezone_microseconds_array without_tz
timestamp_without_timezone_milliseconds_array without_tz
timestamp_without_timezone_nanoseconds_array without_tz
timestamp_without_timezone_seconds_array without_tz