sparrow 2.4.0
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
array_registry.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <array>
18#include <functional>
19#include <limits>
20#include <stdexcept>
21#include <string_view>
22#include <type_traits>
23#include <unordered_map>
24#include <utility>
25#include <vector>
26
31
32// Array type includes for dispatch support
40#include "sparrow/map_array.hpp"
51
52namespace sparrow
53{
54
55 // Primary template - undefined for unsupported types
56 // Specializations are defined further down in this file, after all array types are included
57 template <data_type DT>
59
60 // Helper alias
61 template <data_type DT>
63
64 // Dictionary encoding type map (for integer types)
65 template <data_type DT>
67
68 template <data_type DT>
70
71 // Timestamp types with/without timezone
72 template <data_type DT>
74
75 // List of all supported data types - single source of truth
129
130 // clang-format off
131 // Template specializations for array_type_map - defined here after all array includes
// Each specialization binds one data_type enumerator to the concrete array class used for it.
// Null and boolean arrays.
132 template <> struct array_type_map<data_type::NA> { using type = null_array; };
133 template <> struct array_type_map<data_type::BOOL> { using type = primitive_array<bool>; };
// String and binary arrays: STRING/BINARY map to the 32-bit-offset aliases, LARGE_STRING/LARGE_BINARY
// to the 64-bit-offset aliases, and the *_VIEW enumerators to the view-layout arrays.
145 template <> struct array_type_map<data_type::STRING> { using type = string_array; };
146 template <> struct array_type_map<data_type::STRING_VIEW> { using type = string_view_array; };
147 template <> struct array_type_map<data_type::LARGE_STRING> { using type = big_string_array; };
148 template <> struct array_type_map<data_type::BINARY> { using type = binary_array; };
149 template <> struct array_type_map<data_type::BINARY_VIEW> { using type = binary_view_array; };
150 template <> struct array_type_map<data_type::LARGE_BINARY> { using type = big_binary_array; };
// List arrays.
151 template <> struct array_type_map<data_type::LIST> { using type = list_array; };
152 template <> struct array_type_map<data_type::LARGE_LIST> { using type = big_list_array; };
153 template <> struct array_type_map<data_type::LIST_VIEW> { using type = list_view_array; };
// Struct, map and union arrays.
156 template <> struct array_type_map<data_type::STRUCT> { using type = struct_array; };
157 template <> struct array_type_map<data_type::MAP> { using type = map_array; };
159 template <> struct array_type_map<data_type::DENSE_UNION> { using type = dense_union_array; };
// Decimal arrays, one per storage width.
161 template <> struct array_type_map<data_type::DECIMAL32> { using type = decimal_32_array; };
162 template <> struct array_type_map<data_type::DECIMAL64> { using type = decimal_64_array; };
163 template <> struct array_type_map<data_type::DECIMAL128> { using type = decimal_128_array; };
164 template <> struct array_type_map<data_type::DECIMAL256> { using type = decimal_256_array; };
// Date arrays.
166 template <> struct array_type_map<data_type::DATE_DAYS> { using type = date_days_array; };
183
184 // Dictionary key type specializations
// Each specialization maps an integer data_type enumerator to the fixed-width C++ integer
// type of the same signedness and bit width, exposed as the nested `type` alias.
185 template <> struct dictionary_key_type<data_type::UINT8> { using type = std::uint8_t; };
186 template <> struct dictionary_key_type<data_type::INT8> { using type = std::int8_t; };
187 template <> struct dictionary_key_type<data_type::UINT16> { using type = std::uint16_t; };
188 template <> struct dictionary_key_type<data_type::INT16> { using type = std::int16_t; };
189 template <> struct dictionary_key_type<data_type::UINT32> { using type = std::uint32_t; };
190 template <> struct dictionary_key_type<data_type::INT32> { using type = std::int32_t; };
191 template <> struct dictionary_key_type<data_type::UINT64> { using type = std::uint64_t; };
192 template <> struct dictionary_key_type<data_type::INT64> { using type = std::int64_t; };
193
194 // Timestamp type specializations (with/without timezone)
211
212 // clang-format on
213
252 {
253 public:
254
256 using factory_func = std::function<cloning_ptr<array_wrapper>(arrow_proxy)>;
257
259 using extension_predicate = std::function<bool(const arrow_proxy&)>;
260
262 template <class F>
263 using visit_result_t = std::invoke_result_t<F, null_array>;
264
268 [[nodiscard]] SPARROW_API static array_registry& instance();
273
292
303 SPARROW_API void
304 register_extension(data_type base_type, std::string_view extension_name, factory_func factory);
305
316 SPARROW_API void
318
332
368 template <class F>
369 [[nodiscard]] visit_result_t<F> dispatch(F&& func, const array_wrapper& ar) const;
370
371 private:
372
374
375 // Helper for dispatching with compile-time type knowledge
// Invokes `func` on `ar` after unwrapping it to the concrete array type selected by DT.
// DT is a compile-time tag: the first `if constexpr` branch resolves its array types through
// timestamp_type_map<DT>, every other DT is unwrapped as array_type_t<DT>.
// Returns the result of the std::invoke call, typed as visit_result_t<F>.
376 template <class F, data_type DT>
377 static auto dispatch_for_type(F&& func, const array_wrapper& ar) -> visit_result_t<F>
378 {
379 if constexpr (
382 )
383 {
384 // Special handling for timestamp types with timezone check
// A null result from get_timezone() selects the `without_tz` array type;
// any non-null result selects `with_tz`.
385 using types = timestamp_type_map<DT>;
386 if (get_timezone(ar.get_arrow_proxy()) == nullptr)
387 {
388 return std::invoke(std::forward<F>(func), unwrap_array<typename types::without_tz>(ar));
389 }
390 else
391 {
392 return std::invoke(std::forward<F>(func), unwrap_array<typename types::with_tz>(ar));
393 }
394 }
395 else
396 {
// Generic path: one array type per DT, looked up through array_type_map.
397 return std::invoke(std::forward<F>(func), unwrap_array<array_type_t<DT>>(ar));
398 }
399 }
400
401 template <class F>
402 struct invoker
403 {
404 template <data_type DT>
405 static auto run(F&& func, const array_wrapper& ar) -> visit_result_t<F>
406 {
407 return dispatch_for_type<F, DT>(std::forward<F>(func), ar);
408 }
409 };
410
411 template <class F>
412 static consteval auto make_dispatch_table()
413 {
414 using result_t = visit_result_t<F>;
415 using invoker_t = result_t (*)(F&&, const array_wrapper&);
416
417 return []<std::size_t... I>(std::index_sequence<I...>)
418 {
419 return std::array<invoker_t, all_data_types.size()>{
420 &invoker<F>::template run<all_data_types[I]>...
421 };
422 }(std::make_index_sequence<all_data_types.size()>{});
423 }
424
425 // Helper method for dispatching base types
426 template <class F>
427 [[nodiscard]] visit_result_t<F>
428 dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const;
429
430 struct extension_entry
431 {
432 extension_entry(extension_predicate pred, factory_func fact)
433 : predicate(std::move(pred))
434 , factory(std::move(fact))
435 {
436 }
437
438 extension_predicate predicate;
439 factory_func factory;
440
441 // Helper to check if this extension matches a wrapper
442 [[nodiscard]] bool matches(const array_wrapper& wrapper) const;
443 };
444
445 // Base type factories indexed by data_type
446 std::unordered_map<data_type, factory_func> m_base_factories;
447
448 // Extensions indexed by base data_type
449 std::unordered_map<data_type, std::vector<extension_entry>> m_extensions;
450
454 [[nodiscard]] static bool has_extension_name(const arrow_proxy& proxy, std::string_view extension_name);
455 };
456
457
458} // namespace sparrow
459
460namespace sparrow
461{
462 // ========== Template implementations ==========
463
// Invokes `func` on the concrete array behind the type-erased wrapper `ar`.
// Dictionary-encoded arrays are switched on their data type first; a data type that
// reaches the `default` label raises std::runtime_error.
// Arrays that are not dictionary-encoded are forwarded to dispatch_base_type() together
// with their data type.
464 template <class F>
465 inline auto array_registry::dispatch(F&& func, const array_wrapper& ar) const -> visit_result_t<F>
466 {
467 // Handle dictionary encoding first
468 if (ar.is_dictionary())
469 {
470 switch (ar.data_type())
471 {
472 case data_type::UINT8:
474 case data_type::INT8:
478 case data_type::INT16:
482 case data_type::INT32:
486 case data_type::INT64:
488 default:
489 throw std::runtime_error("data type of dictionary encoded array must be an integer");
490 }
491 }
492
493 const auto dt = ar.data_type();
494 return dispatch_base_type(std::forward<F>(func), ar, dt);
495 }
496
// Calls the dispatch-table entry for `dt`, passing it `func` and `ar`, and returns its result.
// The table is a static constexpr array produced by make_dispatch_table<F>() and is indexed
// directly by the numeric value of `dt`.
// NOTE(review): the index is not range-checked, and the table is ordered like all_data_types.
// This presumably relies on all_data_types listing every enumerator in value order starting
// at 0 - confirm, or validate `dt` before indexing.
497 template <class F>
498 inline auto array_registry::dispatch_base_type(F&& func, const array_wrapper& ar, data_type dt) const
500 {
501 static constexpr auto table = make_dispatch_table<F>();
502 return table[static_cast<std::size_t>(dt)](std::forward<F>(func), ar);
503 }
504
505 // Standalone visit function for backward compatibility
506 template <class F>
507 [[nodiscard]] inline auto visit(F&& func, const array_wrapper& ar) -> std::invoke_result_t<F, null_array>
508 {
509 return array_registry::instance().dispatch(std::forward<F>(func), ar);
510 }
511
512} // namespace sparrow
SPARROW_API void register_extension(data_type base_type, extension_predicate predicate, factory_func factory)
Register an extension type with custom predicate.
SPARROW_API void register_extension(data_type base_type, std::string_view extension_name, factory_func factory)
Register an extension type factory.
static SPARROW_API array_registry & instance()
Get the singleton registry instance.
array_registry & operator=(const array_registry &)=delete
std::function< cloning_ptr< array_wrapper >(arrow_proxy)> factory_func
Factory function type that creates an array_wrapper from an arrow_proxy.
std::invoke_result_t< F, null_array > visit_result_t
Visitor result type alias.
SPARROW_API void register_base_type(data_type dt, factory_func factory)
Register a base type factory.
std::function< bool(const arrow_proxy &)> extension_predicate
Extension predicate that checks if a proxy matches an extension type.
SPARROW_API cloning_ptr< array_wrapper > create(arrow_proxy proxy) const
Create an array wrapper from an arrow_proxy.
array_registry(array_registry &&)=delete
array_registry & operator=(array_registry &&)=delete
visit_result_t< F > dispatch(F &&func, const array_wrapper &ar) const
array_registry(const array_registry &)=delete
Base class for array type erasure.
Smart pointer behaving like a copiable std::unique_ptr.
Definition memory.hpp:126
Dense union array implementation with offset buffer.
Forward declaration of dictionary_encoded_array.
Memory-efficient array implementation for null data types.
Sparse union array implementation without offset buffer.
#define SPARROW_API
Definition config.hpp:38
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
time_array< chrono::time_microseconds > time_microseconds_array
A time array for std::chrono::time_microseconds values.
constexpr std::array all_data_types
time_array< chrono::time_milliseconds > time_milliseconds_array
A time array for std::chrono::time_milliseconds values.
decimal_array< decimal< int128_t > > decimal_128_array
Type alias for 128-bit decimal array.
duration_array< std::chrono::seconds > duration_seconds_array
A duration array for std::chrono::seconds values.
list_array_impl< false > list_array
A list array implementation.
decimal_array< decimal< int32_t > > decimal_32_array
Type alias for 32-bit decimal array.
decimal_array< decimal< int64_t > > decimal_64_array
Type alias for 64-bit decimal array.
time_array< chrono::time_nanoseconds > time_nanoseconds_array
A time array for std::chrono::time_nanoseconds values.
typename dictionary_key_type< DT >::type dictionary_key_t
timestamp_without_timezone_array< zoned_time_without_timezone_seconds > timestamp_without_timezone_seconds_array
A timestamp without timezone array for zoned_time_without_timezone_seconds values.
date_array< date_milliseconds > date_milliseconds_array
A date array for date_milliseconds values.
decimal_array< decimal< int256_t > > decimal_256_array
Type alias for 256-bit decimal array.
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
list_view_array_impl< true > big_list_view_array
typename array_type_map< DT >::type array_type_t
date_array< date_days > date_days_array
A date array for date_days values.
auto visit(F &&func, const array_wrapper &ar) -> std::invoke_result_t< F, null_array >
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
SPARROW_API const date::time_zone * get_timezone(const arrow_proxy &proxy)
interval_array< month_day_nanoseconds_interval > month_day_nanoseconds_interval_array
An interval array for month_day_nanoseconds_interval values.
time_array< chrono::time_seconds > time_seconds_array
A time array for std::chrono::time_seconds values.
timestamp_array< timestamp_second > timestamp_seconds_array
Type aliases for timestamp arrays with common durations.
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_view_array_impl< false > list_view_array
A list view array implementation.
T & unwrap_array(array_wrapper &)
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
timestamp_array< timestamp_nanosecond > timestamp_nanoseconds_array
primitive_array_impl< T, Ext, T2 > primitive_array
Array of values whose type has a fixed binary size.
list_array_impl< true > big_list_array
A big list array implementation.
interval_array< days_time_interval > days_time_interval_array
An interval array for days_time_interval values.
timestamp_without_timezone_array< zoned_time_without_timezone_nanoseconds > timestamp_without_timezone_nanoseconds_array
A timestamp without timezone array for zoned_time_without_timezone_nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_milliseconds > timestamp_without_timezone_milliseconds_array
A timestamp without timezone array for zoned_time_without_timezone_milliseconds values.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
interval_array< chrono::months > months_interval_array
An interval array for std::chrono::months values.
duration_array< std::chrono::microseconds > duration_microseconds_array
A duration array for std::chrono::microseconds values.
duration_array< std::chrono::milliseconds > duration_milliseconds_array
A duration array for std::chrono::milliseconds values.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
duration_array< std::chrono::nanoseconds > duration_nanoseconds_array
A duration array for std::chrono::nanoseconds values.
timestamp_without_timezone_array< zoned_time_without_timezone_microseconds > timestamp_without_timezone_microseconds_array
A timestamp without timezone array for zoned_time_without_timezone_microseconds values.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
timestamp_array< timestamp_microsecond > timestamp_microseconds_array
timestamp_array< timestamp_millisecond > timestamp_milliseconds_array
primitive_array< std::uint16_t > type
primitive_array< std::uint32_t > type
primitive_array< std::uint64_t > type
timestamp_without_timezone_microseconds_array without_tz
timestamp_without_timezone_milliseconds_array without_tz
timestamp_without_timezone_nanoseconds_array without_tz
timestamp_without_timezone_seconds_array without_tz