sparrow 0.9.0
Loading...
Searching...
No Matches
builder.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16#pragma once
17
18#include <map>
19#include <ranges>
20#include <tuple>
21#include <type_traits>
22#include <unordered_map>
23#include <utility>
24#include <vector>
25
26#include "sparrow/array.hpp"
44
45namespace sparrow
46{
47
48 // forward declaration
49 namespace detail
50 {
51 template <class T, class LAYOUT_POLICY, class OPTIONS_TYPE>
52 struct builder;
53 }
54
56 {
57 };
58
60 {
61 };
62
64 {
65 };
66
68 {
69 };
70
71 // option flag to indicate the desire for large lists
73
// Top-level entry point. Per the generated documentation this creates a
// sparrow array from arbitrary nested combinations of ranges, tuples, ...
//
// t     : the input, taken by forwarding reference.
// flags : option flags; the parameters are unnamed, only their decayed types
//         are collected into option_flags_type.
//
// NOTE(review): this listing omits the lines that carried hyperlinks (listing
// lines 87, 91 and 101), so the first `if constexpr` condition and both
// return statements are not visible here.
79 template <class T, class... OPTION_FLAGS>
80 [[nodiscard]] constexpr auto build(T&& t, OPTION_FLAGS&&...)
81 {
82 // for toplevel build calls, the layout policy is determined by the type itself
83 using decayed_t = std::decay_t<T>;
84 using layout_policy = detail::layout_flag_t<decayed_t>;
85 using option_flags_type = sparrow::mpl::typelist<std::decay_t<OPTION_FLAGS>...>;
86
88 {
89 // directly unpack: the wrapped value is obtained through .get()
90 using value_type = typename decayed_t::value_type;
92 std::forward<T>(t).get()
93 );
94 }
// a nullable-like value at top level is rejected at compile time
95 else if constexpr (detail::is_nullable_like<T>)
96 {
97 static_assert(mpl::dependent_false<T>::value, "toplevel type must not be nullable");
98 }
99 else
100 {
102 }
103 }
104
// Overload for std::initializer_list input: wraps the list in
// std::views::all and delegates to build with the option flags forwarded.
105 template <class T, class... OPTION_FLAGS>
106 [[nodiscard]] constexpr auto build(std::initializer_list<T> t, OPTION_FLAGS&&... flags)
107 {
108 auto subranges = std::views::all(t);
// decltype(subranges) is a non-reference type, so this forward hands the
// view over as an rvalue.
109 return build(std::forward<decltype(subranges)>(subranges), std::forward<OPTION_FLAGS>(flags)...);
110 }
111
112 namespace detail
113 {
114 // this is called by the nested recursive calls
// LAYOUT_POLICY is given explicitly by the caller; OPTION_FLAGS is deduced
// from the typelist argument, which is marked [[maybe_unused]].
// NOTE(review): the return statement (listing line 120) is not visible in
// this listing.
115 template <class LAYOUT_POLICY, class T, class... OPTION_FLAGS>
116 [[nodiscard]] constexpr auto
117 build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...> typelist)
118 {
119 using option_flags_type = sparrow::mpl::typelist<OPTION_FLAGS...>;
121 }
122
// Satisfied by input ranges whose value type, as seen through
// ensured_range_value_t, is a scalar type.
123 template <class T>
124 concept translates_to_primitive_layout = std::ranges::input_range<T>
125 && std::is_scalar_v<ensured_range_value_t<T>>;
126
127 template <typename T>
128 concept translates_to_date_layout = std::ranges::input_range<T>
129 && mpl::any_of(
130 date_types_t{},
132 );
133 template <typename T>
134 concept translates_to_duration_layout = std::ranges::input_range<T>
135 && mpl::any_of(
138 );
139 template <typename T>
140 concept translates_to_timestamp_layout = std::ranges::input_range<T>
142
143 template <typename T>
144 concept translates_to_timestamp_without_timezone_layout = std::ranges::input_range<T>
145 && mpl::any_of(
149 );
150
151
152 template <typename T>
153 concept translates_to_interval_layout = std::ranges::input_range<T>
154 && mpl::any_of(
157 );
158
159 template <typename T>
160 concept translates_to_time_layout = std::ranges::input_range<T>
161 && mpl::any_of(
162 time_types_t{},
164 );
165
166 template <class T>
167 concept translate_to_variable_sized_list_layout = std::ranges::input_range<T>
168 && std::ranges::input_range<ensured_range_value_t<T>>
170 && // tuples go to struct layout
171 // value type of inner should not be 'char-like'(
172 // char, byte, uint8), these are handled by
173 // variable_size_binary_array
175
176 template <typename T>
179
180 template <class T>
181 concept translate_to_struct_layout = std::ranges::input_range<T> && tuple_like<ensured_range_value_t<T>>
184
185 template <class T>
187 && std::is_same_v<std::ranges::range_value_t<T>, byte_t>;
188
189 template <class T>
190 concept translate_to_fixed_sized_list_layout = std::ranges::input_range<T>
192 && !(
196 )
198
199 template <class T>
200 concept translate_to_variable_sized_binary_layout = std::ranges::input_range<T>
201 && std::ranges::input_range<ensured_range_value_t<T>>
202 && !(
206 )
208 && // tuples go to struct layout
209 // value type of inner must be char like ( char,
210 // byte, uint8)
212
213 template <class T>
214 concept translate_to_fixed_width_binary_layout = std::ranges::input_range<T>
218
219 template <class T>
220 concept translate_to_union_layout = std::ranges::input_range<T> &&
221 // value type must be a variant-like type
222 // *NOTE* we don't check for nullable here, as we want to handle
223 // nullable variants as in the arrow spec, the nulls are handled
224 // by the elements **in** the variant
226
// Builder selected for ranges satisfying translates_to_primitive_layout when
// no layout is enforced. Per the generated documentation, `type` is
// sparrow::primitive_array<ensured_range_value_t<T>>; its declaration
// (listing line 230) is elided in this listing.
227 template <translates_to_primitive_layout T, class OPTION_FLAGS>
228 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
229 {
231
// Constructs the array by perfect-forwarding the range to type's constructor.
232 template <class U>
233 [[nodiscard]] static constexpr type create(U&& t)
234 {
235 return type(std::forward<U>(t));
236 }
237 };
238
// Builder selected for ranges satisfying translates_to_date_layout when no
// layout is enforced.
// NOTE(review): the declaration of `type` (listing line 242) is elided in
// this listing.
239 template <translates_to_date_layout T, class OPTION_FLAGS>
240 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
241 {
243
// Constructs the array by perfect-forwarding the range to type's constructor.
// Marked [[nodiscard]] like the create functions of the sibling builders.
244 template <class U>
245 [[nodiscard]] static constexpr type create(U&& t)
246 {
247 return type(std::forward<U>(t));
248 }
249 };
250
// Builder selected for ranges satisfying translates_to_duration_layout when
// no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 254) is elided in
// this listing.
251 template <translates_to_duration_layout T, class OPTION_FLAGS>
252 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
253 {
255
// Constructs the array by perfect-forwarding the range to type's constructor.
256 template <class U>
257 [[nodiscard]] static constexpr type create(U&& t)
258 {
259 return type(std::forward<U>(t));
260 }
261 };
262
// Builder selected for ranges satisfying translates_to_timestamp_layout when
// no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 266) is elided in
// this listing.
263 template <translates_to_timestamp_layout T, class OPTION_FLAGS>
264 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
265 {
// decayed result type of get_time_zone() on the range's element type
267 using timezone_ptr = std::decay_t<decltype(std::declval<ensured_range_value_t<T>>().get_time_zone())>;
268
// Passes a time zone plus the forwarded range to type's constructor. The
// time zone is taken from the first element, or is nullptr when the range
// is empty; other elements are not inspected.
// NOTE(review): presumably all elements are expected to share one time
// zone; confirm against callers.
269 template <class U>
270 [[nodiscard]] static constexpr type create(U&& t)
271 {
272 timezone_ptr tz = [&t]() -> timezone_ptr
273 {
274 if (t.empty())
275 {
276 return nullptr;
277 }
278 else
279 {
280 return t.begin()->get_time_zone();
281 }
282 }();
283 return type(tz, std::forward<U>(t));
284 }
285 };
286
// Builder selected for ranges satisfying
// translates_to_timestamp_without_timezone_layout when no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 290) is elided in
// this listing.
287 template <translates_to_timestamp_without_timezone_layout T, class OPTION_FLAGS>
288 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
289 {
291
// Constructs the array by perfect-forwarding the range to type's constructor.
292 template <class U>
293 [[nodiscard]] static constexpr type create(U&& t)
294 {
295 return type(std::forward<U>(t));
296 }
297 };
298
// Builder selected for ranges satisfying translates_to_interval_layout when
// no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 302) is elided in
// this listing.
299 template <translates_to_interval_layout T, class OPTION_FLAGS>
300 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
301 {
303
// Constructs the array by perfect-forwarding the range to type's constructor.
304 template <class U>
305 [[nodiscard]] static constexpr type create(U&& t)
306 {
307 return type(std::forward<U>(t));
308 }
309 };
310
// Builder selected for ranges satisfying translates_to_time_layout when no
// layout is enforced.
// NOTE(review): the declaration of `type` (listing line 314) is elided in
// this listing.
311 template <translates_to_time_layout T, class OPTION_FLAGS>
312 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
313 {
315
// Constructs the array by perfect-forwarding the range to type's constructor.
// Marked [[nodiscard]] like the create functions of the sibling builders.
316 template <class U>
317 [[nodiscard]] static constexpr type create(U&& t)
318 {
319 return type(std::forward<U>(t));
320 }
321 };
322
// Builder selected for ranges satisfying
// translate_to_variable_sized_list_layout when no layout is enforced.
323 template <translate_to_variable_sized_list_layout T, class OPTION_FLAGS>
324 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
325 {
326 using raw_value_type = std::ranges::range_value_t<T>;
327
// NOTE(review): the arguments of this conditional_t (listing lines 329-331)
// are elided in this listing; presumably it picks between list_array and
// big_list_array based on the large-list option flag. Confirm in the header.
328 using type = std::conditional_t<
332
// Builds the list array from three pieces: the child array built from the
// joined inner ranges, offsets computed from the per-element sizes, and
// where_null(t).
333 template <class U>
334 [[nodiscard]] static type create(U&& t)
335 {
// all inner ranges joined into one flat sequence of child values
336 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
337
// one size per outer element, later turned into offsets
338 auto sizes = t
339 | std::views::transform(
340 [](const auto& l)
341 {
342 return get_size_save(l);
343 }
344 );
345
346 // when the raw_value_type is a "express layout desire" we need to
347 // propagate this information to the builder of the flattened values.
348 using layout_policy_type = layout_flag_t<raw_value_type>;
349 auto typed_array = build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{});
350 auto detyped_array = array(std::move(typed_array));
351
352 return type(std::move(detyped_array), type::offset_from_sizes(sizes), where_null(t));
353 }
354 };
355
// Builder selected for ranges satisfying translate_to_fixed_sized_list_layout
// when no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 359) is elided in
// this listing.
356 template <translate_to_fixed_sized_list_layout T, class OPTION_FLAGS>
357 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
358 {
// compile-time length of each inner list, read via std::tuple_size_v from the
// looked-through element type
360 static constexpr std::size_t
361 list_size = std::tuple_size_v<look_trough_t<std::ranges::range_value_t<T>>>;
362 using raw_value_type = std::ranges::range_value_t<T>;
363
// Builds the array from list_size, the child array built from the joined
// inner ranges, and where_null(t).
364 template <class U>
365 [[nodiscard]] static type create(U&& t)
366 {
// all inner ranges joined into one flat sequence of child values
367 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
368
369 // when the raw_value_type is a "express layout desire" we need to
370 // propagate this information to the builder.
371 using layout_policy_type = layout_flag_t<raw_value_type>;
372
373 return type(
374 static_cast<std::uint64_t>(list_size),
375 array(build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{})),
376 where_null(t)
377 );
378 }
379 };
380
// Builder selected for ranges satisfying translate_to_map_layout when no
// layout is enforced. The element type must expose first_type/second_type
// and the members .first/.second.
// NOTE(review): the declaration of `type` (listing line 384) is elided in
// this listing.
381 template <translate_to_map_layout T, class OPTION_FLAGS>
382 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
383 {
385 using raw_value_type = std::ranges::range_value_t<T>;
386 using key_type = typename raw_value_type::first_type;
387 using value_type = typename raw_value_type::second_type;
388
// Splits the elements into a key column and an item column, builds one child
// array from each, and passes both children, the offsets and where_null(t)
// to type's constructor.
389 template <class U>
390 [[nodiscard]] static type create(U&& t)
391 {
// column of all .first members (copied out by value)
392 auto flat_keys = t
393 | std::views::transform(
394 [](const auto& kv)
395 {
396 return kv.first;
397 }
398 );
// column of all .second members (copied out by value)
399 auto flat_items = t
400 | std::views::transform(
401 [](const auto& kv)
402 {
403 return kv.second;
404 }
405 );
406
407 // The key type and the mapped type may each express a layout desire, so
408 // the policy is derived per column from key_type / value_type.
// Previously a single policy was derived from the pair type itself and
// key_type / value_type were unused. cv/ref qualifiers are stripped first
// (first_type may be const-qualified, e.g. for std::map elements).
410 auto keys_array = build_impl<layout_flag_t<std::remove_cvref_t<key_type>>>(flat_keys, OPTION_FLAGS{});
411 auto items_array = build_impl<layout_flag_t<std::remove_cvref_t<value_type>>>(flat_items, OPTION_FLAGS{});
// offsets are computed from a size of 1 repeated once per input element
412 auto offset = map_array::offset_from_sizes(sparrow::repeat_view<size_t>(1, std::ranges::size(t)));
413
414 return type(
415 sparrow::array{std::move(keys_array)},
416 sparrow::array{std::move(items_array)},
417 std::move(offset),
418 where_null(t)
419 );
420 }
421 };
422
// Builder selected for ranges satisfying translate_to_struct_layout (ranges
// of tuple-like values) when no layout is enforced.
// NOTE(review): listing lines 426, 428 and 434 are elided in this listing;
// presumably they declare `type` and `tuple_type` and open the per-index
// iteration call that receives the lambda below. Confirm in the header.
423 template <translate_to_struct_layout T, class OPTION_FLAGS>
424 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
425 {
// number of tuple elements, i.e. of child arrays
427 static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
429
// Builds one child array per tuple index and passes the children together
// with where_null(t) to type's constructor.
430 template <class U>
431 [[nodiscard]] static type create(U&& t)
432 {
433 std::vector<array> detyped_children(n_children);
// i carries the tuple index as a compile-time constant (decltype(i)::value)
435 [&](auto i)
436 {
// column made of the i-th tuple element of every input value
437 auto tuple_i_col = t
438 | std::views::transform(
439 [](const auto& maybe_nullable_tuple)
440 {
441 const auto& tuple_val = ensure_value(maybe_nullable_tuple);
442 return std::get<decltype(i)::value>(tuple_val);
443 }
444 );
445
// the layout policy is derived from the i-th tuple element type
446 using tuple_element_type = std::tuple_element_t<decltype(i)::value, tuple_type>;
447 using layout_policy_type = layout_flag_t<tuple_element_type>;
448 detyped_children[decltype(i)::value] = array(
449 build_impl<layout_policy_type>(tuple_i_col, OPTION_FLAGS{})
450 );
451 }
452 );
453
454 return type(std::move(detyped_children), where_null(t));
455 }
456 };
457
// Builder selected for ranges satisfying
// translate_to_variable_sized_binary_layout when no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 461) is elided in
// this listing.
458 template <translate_to_variable_sized_binary_layout T, class OPTION_FLAGS>
459 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
460 {
462
// Builds the array from a char buffer holding all inner ranges joined
// together, offsets computed from the per-element sizes, and where_null(t).
463 template <class U>
464 [[nodiscard]] static type create(U&& t)
465 {
466 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
467 u8_buffer<char> data_buffer(flat_list_view);
468
// one size per outer element, later turned into offsets
469 auto sizes = t
470 | std::views::transform(
471 [](const auto& l)
472 {
473 return get_size_save(l);
474 }
475 );
476
477 return type(std::move(data_buffer), type::offset_from_sizes(sizes), where_null(t));
478 }
479 };
480
// Builder selected for ranges satisfying
// translate_to_fixed_width_binary_layout when no layout is enforced.
// NOTE(review): the declaration of `type` (listing line 484) is elided in
// this listing.
481 template <translate_to_fixed_width_binary_layout T, class OPTION_FLAGS>
482 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
483 {
485
// Constructs the array by perfect-forwarding the range to type's constructor.
486 template <class U>
487 [[nodiscard]] static type create(U&& t)
488 {
489 return type(std::forward<U>(t));
490 }
491 };
492
// Builder selected for ranges satisfying translate_to_union_layout (ranges
// of variant-like values) when no layout is enforced. It currently always
// produces a sparse_union_array.
// NOTE(review): listing line 505 is elided in this listing; presumably it
// opens the per-index iteration call that receives the lambda below.
493 template <translate_to_union_layout T, class OPTION_FLAGS>
494 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
495 {
496 using type = sparrow::sparse_union_array; // TODO use options to select between sparse and dense
497 using variant_type = std::ranges::range_value_t<T>;
// number of alternatives, i.e. of child arrays
498 static constexpr std::size_t variant_size = std::variant_size_v<variant_type>;
499
// Builds one child array per variant alternative plus a buffer of type ids,
// and passes both to type's constructor. Only participates when `type` is
// sparse_union_array.
500 template <class U>
501 [[nodiscard]] static type create(U&& t)
502 requires(std::is_same_v<type, sparrow::sparse_union_array>)
503 {
504 std::vector<array> detyped_children(variant_size);
// i carries the alternative index as a compile-time constant
506 [&](auto i)
507 {
508 using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
// Each child column has one entry per input element: the held value when
// the element holds alternative i, a value-initialized placeholder otherwise.
509 auto type_i_col = t
510 | std::views::transform(
511 [](const auto& variant)
512 {
513 return variant.index() == decltype(i)::value
514 ? std::get<type_at_index>(variant)
515 : type_at_index{};
516 }
517 );
518
// the layout policy is derived from the alternative's type
519 using layout_policy_type = layout_flag_t<type_at_index>;
520 detyped_children[decltype(i)::value] = array(
521 build_impl<layout_policy_type>(type_i_col, OPTION_FLAGS{})
522 );
523 }
524 );
525
526 // type-ids: the active alternative index of each element, narrowed to uint8_t
527 auto type_id_range = t
528 | std::views::transform(
529 [](const auto& v)
530 {
531 return static_cast<std::uint8_t>(v.index());
532 }
533 );
534 sparrow::u8_buffer<std::uint8_t> type_id_buffer(type_id_range);
535
536 return type(std::move(detyped_children), std::move(type_id_buffer));
537 }
538 };
539
// Builder used when the dict-encoded layout is enforced
// (enforce_dict_encoded_layout), for any range type T.
// NOTE(review): the declaration of `type` (listing line 544) is elided in
// this listing.
540 template <class T, class OPTION_FLAGS>
541 struct builder<T, enforce_dict_encoded_layout, OPTION_FLAGS>
542 {
543 using key_type = std::uint64_t;
545 // keep the nulls
546 using raw_range_value_type = std::ranges::range_value_t<T>;
547
// Deduplicates the input: `values` receives each distinct element once, in
// order of first appearance, and `keys` receives one index into `values`
// per input element. Both are then handed to type's constructor.
548 template <class U>
549 [[nodiscard]] static type create(U&& t)
550 {
551 const auto input_size = range_size(t);
// next key to hand out; equals the number of distinct values seen so far
552 key_type key = 0;
// distinct value -> assigned key, ordered by nested_less
553 std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
554 std::vector<raw_range_value_type> values;
555 std::vector<key_type> keys;
556
// input_size is an upper bound for values and the exact size for keys
557 values.reserve(input_size);
558 keys.reserve(input_size);
559
560 for (const auto& v : t)
561 {
562 auto find_res = value_map.find(v);
563 if (find_res == value_map.end())
564 {
// first occurrence: register the value under a fresh key
565 value_map.insert({v, key});
566 values.push_back(v);
567 keys.push_back(key);
568 ++key;
569 }
570 else
571 {
// repeated value: reuse its existing key
572 keys.push_back(find_res->second);
573 }
574 }
575 auto keys_buffer = sparrow::u8_buffer<key_type>(keys);
576
577 // since we do not support dict[dict or dict[run_end
578 // we can hard code the layout policy here
579 using layout_policy_type = dont_enforce_layout;
580
581 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
582
583 return type(std::move(keys_buffer), array(std::move(values_array)));
584 }
585 };
586
587 template <class T, class OPTION_FLAGS>
589 {
591 using raw_range_value_type = std::ranges::range_value_t<T>;
592
// Collapses consecutive equal elements of t into runs and builds the result
// from two children: an int64 array of accumulated run ends and an array of
// the distinct run values. Equality is decided by nested_eq.
// For an empty input both children are empty.
593 template <class U>
594 [[nodiscard]] static type create(U&& t)
595 {
596 using value_type = std::decay_t<raw_range_value_type>;
597
598 const auto input_size = range_size(t);
599
600 std::vector<value_type> values{};
601 std::vector<int64_t> acc_run_lengths{};
602
// input_size is an upper bound: there are at most as many runs as elements
603 values.reserve(input_size);
604 acc_run_lengths.reserve(input_size);
605
606 const auto eq = nested_eq<value_type>{};
607
608 // accumulate the run lengths
// i counts the elements consumed so far, which is also the end position
// of the run that has just finished
609 int64_t i = 0;
610 for (const auto& v : t)
611 {
612 // first value
613 if (i == 0)
614 {
615 values.push_back(v);
616 }
617 // rest
618 else
619 {
// a value differing from the current run's value closes that run
620 if (!eq(values.back(), v))
621 {
622 acc_run_lengths.push_back(i);
623 values.push_back(v);
624 }
625 }
626 ++i;
627 }
// Close the last run. Skipped for empty input: there is no run to close, and
// pushing 0 would leave one run end with no matching value.
628 if (i != 0) { acc_run_lengths.push_back(i); }
629
630 auto run_length_typed_array = primitive_array<int64_t>(acc_run_lengths);
631
632 // since we do not support run_end[dict or run_end[run_end
633 // we can hard code the layout policy here
634 using layout_policy_type = dont_enforce_layout;
635 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
636
637 return type(array(std::move(run_length_typed_array)), array(std::move(values_array)));
638 }
639 };
640 } // namespace detail
641} // namespace sparrow
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
static auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
A view that repeats a value a given number of times.
A sparse union array implementation.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
std::conditional_t< is_dict_encode< mnv_t< T > >, enforce_dict_encoded_layout, std::conditional_t< is_run_end_encode< mnv_t< T > >, enforce_run_end_encoded_layout, dont_enforce_layout > > layout_flag_t
constexpr decltype(auto) ensure_value_range(T &&t) noexcept
look_trough_t< std::ranges::range_value_t< T > > ensured_range_value_t
ensured_range_value_t< ensured_range_value_t< T > > nested_ensured_range_inner_value_t
constexpr std::size_t get_size_save(T &&t)
constexpr auto build_impl(T &&t, sparrow::mpl::typelist< OPTION_FLAGS... > typelist)
Definition builder.hpp:117
constexpr std::vector< std::size_t > where_null(T &&t)
constexpr decltype(auto) ensure_value(T &&t)
void for_each_index(F &&f)
consteval bool any_of(L< T... >, Predicate predicate={})
Checks that at least one type in the provided list makes the provided predicate return true.
Definition mp_utils.hpp:243
constexpr bool is_type_instance_of_v
true if T is a concrete type template instantiation of U which is a type template.
Definition mp_utils.hpp:49
consteval bool contains()
Definition mp_utils.hpp:349
std::byte byte_t
primitive_array_impl< T > time_array
Array of time values.
mpl:: typelist< chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds > time_types_t
list_array_impl< false > list_array
A list array implementation.
constexpr auto build(T &&t, OPTION_FLAGS &&...)
function to create a sparrow array from arbitrary nested combinations of ranges, tuples,...
Definition builder.hpp:80
primitive_array_impl< T > interval_array
Array of interval values.
mpl::typelist< zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds > zoned_time_without_timezone_types_t
mpl::typelist< date_days, date_milliseconds > date_types_t
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
A variable-size string array implementation.
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
constexpr large_list_flag_t large_list_flag
Definition builder.hpp:72
primitive_array_impl< T > date_array
Array of std::chrono::duration values.
primitive_array_impl< T > primitive_array
Array of values of whose type has fixed binary size.
date::zoned_time< Duration, TimeZonePtr > timestamp
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
A fixed-width binary array implementation.
primitive_array_impl< T > timestamp_without_timezone_array
Array of timestamps without timezone.
list_array_impl< true > big_list_array
A big list array implementation.
primitive_array_impl< T > duration_array
Array of std::chrono::duration values.
mpl::typelist< chrono::months, days_time_interval, month_day_nanoseconds_interval > interval_types_t
mpl:: typelist< std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds > duration_types_t
sparrow::primitive_array< ensured_range_value_t< T > > type
Definition builder.hpp:230
std::decay_t< decltype(std::declval< ensured_range_value_t< T > >().get_time_zone())> timezone_ptr
Definition builder.hpp:267
Workaround to replace static_assert(false) in template code.
Definition mp_utils.hpp:32
Compile-time type predicate: true if the evaluated type is the same as T.
Definition mp_utils.hpp:206
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:54