sparrow ..
Loading...
Searching...
No Matches
builder.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16#pragma once
17
18#include <map>
19#include <ranges>
20#include <tuple>
21#include <type_traits>
22#include <unordered_map>
23#include <utility>
24#include <vector>
25
26#include "sparrow/array.hpp"
44
45namespace sparrow
46{
47
48 // forward declaration
49 namespace detail
50 {
51 template <class T, class LAYOUT_POLICY, class OPTIONS_TYPE>
52 struct builder;
53 }
54
56 {
57 };
58
60 {
61 };
62
64 {
65 };
66
68 {
69 };
70
71 // option flag to indicate the desire for large lists
73
79 template <class T, class... OPTION_FLAGS>
80 [[nodiscard]] constexpr auto build(T&& t, OPTION_FLAGS&&...)
81 {
82 // for toplevel build calls, the layout policy is determined by the type itself
83 using decayed_t = std::decay_t<T>;
84 using layout_policy = detail::layout_flag_t<decayed_t>;
85 using option_flags_type = sparrow::mpl::typelist<std::decay_t<OPTION_FLAGS>...>;
86
88 {
89 // directly unpack
90 using value_type = typename decayed_t::value_type;
92 std::forward<T>(t).get()
93 );
94 }
95 else if constexpr (detail::is_nullable_like<T>)
96 {
97 static_assert(mpl::dependent_false<T>::value, "toplevel type must not be nullable");
98 }
99 else
100 {
102 }
103 }
104
105 template <class T, class... OPTION_FLAGS>
106 [[nodiscard]] constexpr auto build(std::initializer_list<T> t, OPTION_FLAGS&&... flags)
107 {
108 auto subranges = std::views::all(t);
109 return build(std::forward<decltype(subranges)>(subranges), std::forward<OPTION_FLAGS>(flags)...);
110 }
111
112 namespace detail
113 {
114 // this is called by the nested recursive calls
115 template <class LAYOUT_POLICY, class T, class... OPTION_FLAGS>
116 [[nodiscard]] constexpr auto
117 build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...> typelist)
118 {
119 using option_flags_type = sparrow::mpl::typelist<OPTION_FLAGS...>;
121 }
122
// Satisfied by input ranges whose element type, once unwrapped through
// look_trough_t (this is what ensured_range_value_t does), is a scalar type
// according to std::is_scalar_v. This concept selects the primitive-array
// specialization of detail::builder.
123 template <class T>
124 concept translates_to_primitive_layout = std::ranges::input_range<T>
125 && std::is_scalar_v<ensured_range_value_t<T>>;
126
127 template <typename T>
128 concept translates_to_date_layout = std::ranges::input_range<T>
129 && mpl::any_of(
130 date_types_t{},
132 );
133 template <typename T>
134 concept translates_to_duration_layout = std::ranges::input_range<T>
135 && mpl::any_of(
138 );
139 template <typename T>
140 concept translates_to_timestamp_layout = std::ranges::input_range<T>
142
143 template <typename T>
144 concept translates_to_timestamp_without_timezone_layout = std::ranges::input_range<T>
145 && mpl::any_of(
149 );
150
151
152 template <typename T>
153 concept translates_to_interval_layout = std::ranges::input_range<T>
154 && mpl::any_of(
157 );
158
159 template <typename T>
160 concept translates_to_time_layout = std::ranges::input_range<T>
161 && mpl::any_of(
162 time_types_t{},
164 );
165
166 template <class T>
167 concept translate_to_variable_sized_list_layout = std::ranges::input_range<T>
168 && std::ranges::input_range<ensured_range_value_t<T>>
170 && // tuples go to struct layout
171 // value type of inner should not be 'char-like'(
172 // char, byte, uint8), these are handled by
173 // variable_size_binary_array
175
176 template <typename T>
179
180 template <class T>
181 concept translate_to_struct_layout = std::ranges::input_range<T> && tuple_like<ensured_range_value_t<T>>
184
185 template <class T>
187 && std::is_same_v<std::ranges::range_value_t<T>, byte_t>;
188
189 template <class T>
190 concept translate_to_fixed_sized_list_layout = std::ranges::input_range<T>
192 && !(
196 )
198
199 template <class T>
200 concept translate_to_variable_sized_binary_layout = std::ranges::input_range<T>
201 && std::ranges::input_range<ensured_range_value_t<T>>
202 && !(
206 )
208 && // tuples go to struct layout
209 // value type of inner must be char like ( char,
210 // byte, uint8)
212
213 template <class T>
214 concept translate_to_fixed_width_binary_layout = std::ranges::input_range<T>
218
219 template <class T>
220 concept translate_to_union_layout = std::ranges::input_range<T> &&
221 // value type must be a variant-like type
222 // *NOTE* we don't check for nullable here, as we want to handle
223 // nullable variants as in the arrow spec, the nulls are handled
224 // by the elements **in** the variant
226
227 template <translates_to_primitive_layout T, class OPTION_FLAGS>
228 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
229 {
231
232 template <class U>
233 [[nodiscard]] static constexpr type create(U&& t)
234 {
235 return type(std::forward<U>(t));
236 }
237 };
238
239 template <translates_to_date_layout T, class OPTION_FLAGS>
240 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
241 {
243
244 template <class U>
245 static constexpr type create(U&& t)
246 {
247 return type(std::forward<U>(t));
248 }
249 };
250
251 template <translates_to_duration_layout T, class OPTION_FLAGS>
252 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
253 {
255
256 template <class U>
257 [[nodiscard]] static constexpr type create(U&& t)
258 {
259 return type(std::forward<U>(t));
260 }
261 };
262
263 template <translates_to_timestamp_layout T, class OPTION_FLAGS>
264 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
265 {
267 using timezone_ptr = std::decay_t<decltype(std::declval<ensured_range_value_t<T>>().get_time_zone())>;
268
269 template <class U>
270 [[nodiscard]] static constexpr type create(U&& t)
271 {
272 timezone_ptr tz = [&t]() -> timezone_ptr
273 {
274 if (t.empty())
275 {
276 return nullptr;
277 }
278 else
279 {
280 return t.begin()->get_time_zone();
281 }
282 }();
283 return type(tz, std::forward<U>(t));
284 }
285 };
286
287 template <translates_to_timestamp_without_timezone_layout T, class OPTION_FLAGS>
288 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
289 {
291
292 template <class U>
293 [[nodiscard]] static constexpr type create(U&& t)
294 {
295 return type(std::forward<U>(t));
296 }
297 };
298
299 template <translates_to_interval_layout T, class OPTION_FLAGS>
300 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
301 {
303
304 template <class U>
305 [[nodiscard]] static constexpr type create(U&& t)
306 {
307 return type(std::forward<U>(t));
308 }
309 };
310
311 template <translates_to_time_layout T, class OPTION_FLAGS>
312 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
313 {
315
316 template <class U>
317 static constexpr type create(U&& t)
318 {
319 return type(std::forward<U>(t));
320 }
321 };
322
323 template <translate_to_variable_sized_list_layout T, class OPTION_FLAGS>
324 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
325 {
326 using raw_value_type = std::ranges::range_value_t<T>;
327
328 using type = std::conditional_t<
332
333 template <class U>
334 [[nodiscard]] static type create(U&& t)
335 {
336 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
337
338 auto sizes = t
339 | std::views::transform(
340 [](const auto& l)
341 {
342 return get_size_save(l);
343 }
344 );
345
346 // when the raw_value_type is a "express layout desire" we need to
347 // propagate this information to the builder, so it can handle the
348 using layout_policy_type = layout_flag_t<raw_value_type>;
349 auto typed_array = build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{});
350 auto detyped_array = array(std::move(typed_array));
351
352 return type(std::move(detyped_array), type::offset_from_sizes(sizes), where_null(t));
353 }
354 };
355
356 template <translate_to_fixed_sized_list_layout T, class OPTION_FLAGS>
357 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
358 {
360 static constexpr std::size_t
361 list_size = std::tuple_size_v<look_trough_t<std::ranges::range_value_t<T>>>;
362 using raw_value_type = std::ranges::range_value_t<T>;
363
364 template <class U>
365 [[nodiscard]] static type create(U&& t)
366 {
367 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
368
369 // when the raw_value_type is a "express layout desire" we need to
370 // propagate this information to the builder.
371 using layout_policy_type = layout_flag_t<raw_value_type>;
372
373 return type(
374 static_cast<std::uint64_t>(list_size),
375 array(build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{})),
376 where_null(t)
377 );
378 }
379 };
380
381 template <translate_to_map_layout T, class OPTION_FLAGS>
382 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
383 {
385 using raw_value_type = std::ranges::range_value_t<T>;
386 using key_type = typename raw_value_type::first_type;
387 using value_type = typename raw_value_type::second_type;
388
389 template <class U>
390 [[nodiscard]] static type create(U&& t)
391 {
392 auto flat_keys = t
393 | std::views::transform(
394 [](const auto& kv)
395 {
396 return kv.first;
397 }
398 );
399 auto flat_items = t
400 | std::views::transform(
401 [](const auto& kv)
402 {
403 return kv.second;
404 }
405 );
406
407 // when the raw_value_type is a "express layout desire" we need to
408 // propagate this information to the builder, so it can handle the
409 using layout_policy_type = layout_flag_t<raw_value_type>;
410 auto keys_array = build_impl<layout_policy_type>(flat_keys, OPTION_FLAGS{});
411 auto items_array = build_impl<layout_policy_type>(flat_items, OPTION_FLAGS{});
412 auto offset = map_array::offset_from_sizes(sparrow::repeat_view<size_t>(1, std::ranges::size(t)));
413
414 return type(
415 sparrow::array{std::move(keys_array)},
416 sparrow::array{std::move(items_array)},
417 std::move(offset),
418 where_null(t)
419 );
420 }
421 };
422
423 template <translate_to_struct_layout T, class OPTION_FLAGS>
424 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
425 {
427 static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
429
430 template <class U>
431 [[nodiscard]] static type create(U&& t)
432 {
433 std::vector<array> detyped_children(n_children);
435 [&](auto i)
436 {
437 auto tuple_i_col = t
438 | std::views::transform(
439 [](const auto& maybe_nullable_tuple)
440 {
441 const auto& tuple_val = ensure_value(maybe_nullable_tuple);
442 return std::get<decltype(i)::value>(tuple_val);
443 }
444 );
445
446 using tuple_element_type = std::tuple_element_t<decltype(i)::value, tuple_type>;
447 using layout_policy_type = layout_flag_t<tuple_element_type>;
448 detyped_children[decltype(i)::value] = array(
449 build_impl<layout_policy_type>(tuple_i_col, OPTION_FLAGS{})
450 );
451 }
452 );
453
454 return type(std::move(detyped_children), where_null(t));
455 }
456 };
457
458 template <translate_to_variable_sized_binary_layout T, class OPTION_FLAGS>
459 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
460 {
462
463 template <class U>
464 [[nodiscard]] static type create(U&& t)
465 {
466 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
467 u8_buffer<char> data_buffer(flat_list_view);
468
469 auto sizes = t
470 | std::views::transform(
471 [](const auto& l)
472 {
473 return get_size_save(l);
474 }
475 );
476
477 return type(std::move(data_buffer), type::offset_from_sizes(sizes), where_null(t));
478 }
479 };
480
481 template <translate_to_fixed_width_binary_layout T, class OPTION_FLAGS>
482 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
483 {
485
486 template <class U>
487 [[nodiscard]] static type create(U&& t)
488 {
489 return type(std::forward<U>(t));
490 }
491 };
492
493 template <translate_to_union_layout T, class OPTION_FLAGS>
494 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
495 {
496 using type = sparrow::sparse_union_array; // TODO use options to select between sparse and dense
497 using variant_type = std::ranges::range_value_t<T>;
498 static constexpr std::size_t variant_size = std::variant_size_v<variant_type>;
499
500 template <class U>
501 [[nodiscard]] static type create(U&& t)
502 requires(std::is_same_v<type, sparrow::sparse_union_array>)
503 {
504 std::vector<array> detyped_children(variant_size);
506 [&](auto i)
507 {
508 using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
509 auto type_i_col = t
510 | std::views::transform(
511 [](const auto& variant)
512 {
513 return variant.index() == decltype(i)::value
514 ? std::get<type_at_index>(variant)
515 : type_at_index{};
516 }
517 );
518
519 using layout_policy_type = layout_flag_t<type_at_index>;
520 detyped_children[decltype(i)::value] = array(
521 build_impl<layout_policy_type>(type_i_col, OPTION_FLAGS{})
522 );
523 }
524 );
525
526 // type-ids
527 auto type_id_range = t
528 | std::views::transform(
529 [](const auto& v)
530 {
531 return static_cast<std::uint8_t>(v.index());
532 }
533 );
534 sparrow::u8_buffer<std::uint8_t> type_id_buffer(type_id_range);
535
536 return type(std::move(detyped_children), std::move(type_id_buffer));
537 }
538 };
539
540 template <class T, class OPTION_FLAGS>
541 struct builder<T, enforce_dict_encoded_layout, OPTION_FLAGS>
542 {
543 using key_type = std::uint64_t;
545 // keep the nulls
546 using raw_range_value_type = std::ranges::range_value_t<T>;
547
548 template <class U>
549 [[nodiscard]] static type create(U&& t)
550 {
551 const auto input_size = range_size(t);
552 key_type key = 0;
553 std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
554 std::vector<raw_range_value_type> values;
555 std::vector<key_type> keys;
556
557 values.reserve(input_size);
558 keys.reserve(input_size);
559
560 for (const auto& v : t)
561 {
562 auto find_res = value_map.find(v);
563 if (find_res == value_map.end())
564 {
565 value_map.insert({v, key});
566 values.push_back(v);
567 keys.push_back(key);
568 ++key;
569 }
570 else
571 {
572 keys.push_back(find_res->second);
573 }
574 }
575 auto keys_buffer = sparrow::u8_buffer<key_type>(keys);
576
577 // since we do not support dict[dict or dict[run_end
578 // we can hard code the layout policy here
579 using layout_policy_type = dont_enforce_layout;
580
581 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
582
583 return type(std::move(keys_buffer), array(std::move(values_array)));
584 }
585 };
586
587 template <class T, class OPTION_FLAGS>
589 {
591 using raw_range_value_type = std::ranges::range_value_t<T>;
592
593 template <class U>
594 [[nodiscard]] static type create(U&& t)
595 {
596 using value_type = std::decay_t<raw_range_value_type>;
597
598 const auto input_size = range_size(t);
599
600 std::vector<value_type> values{};
601 std::vector<int64_t> acc_run_lengths{};
602
603 values.reserve(input_size);
604 acc_run_lengths.reserve(input_size);
605
606 const auto eq = nested_eq<value_type>{};
607
608 // accumulate the run lengths
609 int64_t i = 0;
610 for (const auto& v : t)
611 {
612 // first value
613 if (i == 0)
614 {
615 values.push_back(v);
616 }
617 // rest
618 else
619 {
620 if (!eq(values.back(), v))
621 {
622 acc_run_lengths.push_back(i);
623 values.push_back(v);
624 }
625 }
626 ++i;
627 }
628 acc_run_lengths.push_back(i);
629
630 auto run_length_typed_array = primitive_array<int64_t>(acc_run_lengths);
631
632 // since we do not support dict[dict or dict[run_end
633 // we can hard code the layout policy here
634 using layout_policy_type = dont_enforce_layout;
635 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
636
637 return type(array(std::move(run_length_typed_array)), array(std::move(values_array)));
638 }
639 };
640 } // namespace detail
641} // namespace sparrow
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:41
Forward declaration of dictionary_encoded_array.
static auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from map sizes.
A view that repeats a value a given number of times.
Sparse union array implementation without offset buffer.
Array implementation for storing timestamp values with timezone information.
This buffer class is used as storage buffer for all sparrow arrays.
Concept for character-like types.
Concept for std::array types.
std::conditional_t< is_dict_encode< mnv_t< T > >, enforce_dict_encoded_layout, std::conditional_t< is_run_end_encode< mnv_t< T > >, enforce_run_end_encoded_layout, dont_enforce_layout > > layout_flag_t
constexpr decltype(auto) ensure_value_range(T &&t) noexcept
look_trough_t< std::ranges::range_value_t< T > > ensured_range_value_t
ensured_range_value_t< ensured_range_value_t< T > > nested_ensured_range_inner_value_t
constexpr std::size_t get_size_save(T &&t)
constexpr auto build_impl(T &&t, sparrow::mpl::typelist< OPTION_FLAGS... > typelist)
Definition builder.hpp:117
constexpr std::vector< std::size_t > where_null(T &&t)
constexpr decltype(auto) ensure_value(T &&t)
void for_each_index(F &&f)
consteval bool any_of(L< T... >, Predicate predicate={})
Checks if at least one type in the typelist satisfies the predicate.
Definition mp_utils.hpp:450
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
consteval bool contains()
Checks if a typelist contains a specific type.
Definition mp_utils.hpp:633
std::byte byte_t
primitive_array_impl< T > time_array
Array of time values.
mpl:: typelist< chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds > time_types_t
list_array_impl< false > list_array
A list array implementation.
constexpr auto build(T &&t, OPTION_FLAGS &&...)
function to create a sparrow array from arbitrary nested combinations of ranges, tuples,...
Definition builder.hpp:80
primitive_array_impl< T > interval_array
Array of interval values.
mpl::typelist< zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds > zoned_time_without_timezone_types_t
mpl::typelist< date_days, date_milliseconds > date_types_t
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
constexpr large_list_flag_t large_list_flag
Definition builder.hpp:72
primitive_array_impl< T > date_array
Array of std::chrono::duration values.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
primitive_array_impl< T > primitive_array
Array of values of whose type has fixed binary size.
date::zoned_time< Duration, TimeZonePtr > timestamp
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
primitive_array_impl< T > timestamp_without_timezone_array
Array of timestamps without timezone.
list_array_impl< true > big_list_array
A big list array implementation.
primitive_array_impl< T > duration_array
Array of std::chrono::duration values.
mpl::typelist< chrono::months, days_time_interval, month_day_nanoseconds_interval > interval_types_t
mpl:: typelist< std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds > duration_types_t
sparrow::primitive_array< ensured_range_value_t< T > > type
Definition builder.hpp:230
std::decay_t< decltype(std::declval< ensured_range_value_t< T > >().get_time_zone())> timezone_ptr
Definition builder.hpp:267
Workaround to replace static_assert(false) in template code.
Definition mp_utils.hpp:54
Compile-time type predicate: true if the evaluated type is the same as T.
Definition mp_utils.hpp:396
A sequence of types used for metaprogramming operations.
Definition mp_utils.hpp:123