sparrow 0.9.0
Loading...
Searching...
No Matches
builder.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16#pragma once
17
18#include <map>
19#include <ranges>
20#include <tuple>
21#include <type_traits>
22#include <utility>
23#include <vector>
24
25#include "sparrow/array.hpp"
41
42namespace sparrow
43{
44
45 // Forward declaration of detail::builder so that it can be named before the
// partial specializations that appear further down in this file.
46 namespace detail
47 {
48 template <class T, class LAYOUT_POLICY, class OPTIONS_TYPE>
49 struct builder;
50 }
51
53 {
54 };
55
57 {
58 };
59
61 {
62 };
63
65 {
66 };
67
68 // option flag to indicate the desire for large lists
70
76 template <class T, class... OPTION_FLAGS>
77 [[nodiscard]] auto build(T&& t, OPTION_FLAGS&&...)
78 {
79 // for toplevel build calls, the layout policy is determined by the type itself
80 using decayed_t = std::decay_t<T>;
81 using layout_policy = detail::layout_flag_t<decayed_t>;
82 using option_flags_type = sparrow::mpl::typelist<std::decay_t<OPTION_FLAGS>...>;
83
85 {
86 // directly unpack
87 using value_type = typename decayed_t::value_type;
89 std::forward<T>(t).get()
90 );
91 }
92 else if constexpr (detail::is_nullable_like<T>)
93 {
94 static_assert(mpl::dependent_false<T>::value, "toplevel type must not be nullable");
95 }
96 else
97 {
99 }
100 }
101
102 template <class T, class... OPTION_FLAGS>
103 [[nodiscard]] auto build(std::initializer_list<T> t, OPTION_FLAGS&&... flags)
104 {
105 auto subranges = std::views::all(t);
106 return build(std::forward<decltype(subranges)>(subranges), std::forward<OPTION_FLAGS>(flags)...);
107 }
108
109 namespace detail
110 {
111 // this is called by the nested recursive calls
112 template <class LAYOUT_POLICY, class T, class... OPTION_FLAGS>
113 [[nodiscard]] auto build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...> typelist)
114 {
115 using option_flags_type = sparrow::mpl::typelist<OPTION_FLAGS...>;
117 }
118
119 template <class T>
120 concept translates_to_primitive_layout = std::ranges::input_range<T>
121 && std::is_scalar_v<ensured_range_value_t<T>>;
122
123 template <typename T>
124 concept translates_to_date_layout = std::ranges::input_range<T>
125 && mpl::any_of(
126 date_types_t{},
128 );
129 template <typename T>
130 concept translates_to_duration_layout = std::ranges::input_range<T>
131 && mpl::any_of(
134 );
135 template <typename T>
136 concept translates_to_timestamp_layout = std::ranges::input_range<T>
138
139 template <typename T>
140 concept translates_to_timestamp_without_timezone_layout = std::ranges::input_range<T>
141 && mpl::any_of(
145 );
146
147
148 template <typename T>
149 concept translates_to_interval_layout = std::ranges::input_range<T>
150 && mpl::any_of(
153 );
154
155 template <typename T>
156 concept translates_to_time_layout = std::ranges::input_range<T>
157 && mpl::any_of(
158 time_types_t{},
160 );
161
162 template <class T>
163 concept translate_to_variable_sized_list_layout = std::ranges::input_range<T>
164 && std::ranges::input_range<ensured_range_value_t<T>>
166 && // tuples go to struct layout
167 // value type of inner should not be 'char-like'(
168 // char, byte, uint8), these are handled by
169 // variable_size_binary_array
171
172 template <class T>
173 concept translate_to_struct_layout = std::ranges::input_range<T> && tuple_like<ensured_range_value_t<T>>
175
176 template <class T>
178 && std::is_same_v<std::ranges::range_value_t<T>, byte_t>;
179
180 template <class T>
181 concept translate_to_fixed_sized_list_layout = std::ranges::input_range<T>
183 && !(
187 )
189
190 template <class T>
191 concept translate_to_variable_sized_binary_layout = std::ranges::input_range<T>
192 && std::ranges::input_range<ensured_range_value_t<T>>
193 && !(
197 )
199 && // tuples go to struct layout
200 // value type of inner must be char like ( char,
201 // byte, uint8)
203
204 template <class T>
205 concept translate_to_fixed_width_binary_layout = std::ranges::input_range<T>
209
210 template <class T>
211 concept translate_to_union_layout = std::ranges::input_range<T> &&
212 // value type must be a variant-like type
213 // *NOTE* we don't check for nullable here, as we want to handle
214 // nullable variants as in the arrow spec, the nulls are handled
215 // by the elements **in** the variant
217
218 template <translates_to_primitive_layout T, class OPTION_FLAGS>
219 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
220 {
222
// Builds the array of this specialization: the input range is perfectly
// forwarded to the constructor of `type`.
223 template <class U>
224 [[nodiscard]] static type create(U&& t)
225 {
226 return type(std::forward<U>(t));
227 }
228 };
229
230 template <translates_to_date_layout T, class OPTION_FLAGS>
231 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
232 {
234
235 template <class U>
236 static type create(U&& t)
237 {
238 return type(std::forward<U>(t));
239 }
240 };
241
242 template <translates_to_duration_layout T, class OPTION_FLAGS>
243 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
244 {
246
// Builds the array of this specialization: the input range is perfectly
// forwarded to the constructor of `type`.
247 template <class U>
248 [[nodiscard]] static type create(U&& t)
249 {
250 return type(std::forward<U>(t));
251 }
252 };
253
254 template <translates_to_timestamp_layout T, class OPTION_FLAGS>
255 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
256 {
258 using timezone_ptr = std::decay_t<decltype(std::declval<ensured_range_value_t<T>>().get_time_zone())>;
259
260 template <class U>
261 [[nodiscard]] static type create(U&& t)
262 {
263 timezone_ptr tz = [&t]() -> timezone_ptr
264 {
265 if (t.empty())
266 {
267 return nullptr;
268 }
269 else
270 {
271 return t.begin()->get_time_zone();
272 }
273 }();
274 return type(tz, std::forward<U>(t));
275 }
276 };
277
278 template <translates_to_timestamp_without_timezone_layout T, class OPTION_FLAGS>
279 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
280 {
282
// Builds the array of this specialization: the input range is perfectly
// forwarded to the constructor of `type`.
283 template <class U>
284 [[nodiscard]] static type create(U&& t)
285 {
286 return type(std::forward<U>(t));
287 }
288 };
289
290 template <translates_to_interval_layout T, class OPTION_FLAGS>
291 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
292 {
294
// Builds the array of this specialization: the input range is perfectly
// forwarded to the constructor of `type`.
295 template <class U>
296 [[nodiscard]] static type create(U&& t)
297 {
298 return type(std::forward<U>(t));
299 }
300 };
301
302 template <translates_to_time_layout T, class OPTION_FLAGS>
303 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
304 {
306
307 template <class U>
308 static type create(U&& t)
309 {
310 return type(std::forward<U>(t));
311 }
312 };
313
314 template <translate_to_variable_sized_list_layout T, class OPTION_FLAGS>
315 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
316 {
317 using raw_value_type = std::ranges::range_value_t<T>;
318
319 using type = std::conditional_t<
320 mpl::contains<large_list_flag_t>(OPTION_FLAGS{}),
323
324 template <class U>
325 [[nodiscard]] static type create(U&& t)
326 {
327 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
328
329 auto sizes = t
330 | std::views::transform(
331 [](const auto& l)
332 {
333 return get_size_save(l);
334 }
335 );
336
337 // when the raw_value_type is a "express layout desire" we need to
338 // propagate this information to the builder, so it can handle the
339 using layout_policy_type = layout_flag_t<raw_value_type>;
340 auto typed_array = build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{});
341 auto detyped_array = array(std::move(typed_array));
342
343 return type(std::move(detyped_array), type::offset_from_sizes(sizes), where_null(t));
344 }
345 };
346
347 template <translate_to_fixed_sized_list_layout T, class OPTION_FLAGS>
348 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
349 {
351 static constexpr std::size_t
352 list_size = std::tuple_size_v<look_trough_t<std::ranges::range_value_t<T>>>;
353 using raw_value_type = std::ranges::range_value_t<T>;
354
355 template <class U>
356 [[nodiscard]] static type create(U&& t)
357 {
358 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
359
360 // when the raw_value_type is a "express layout desire" we need to
361 // propagate this information to the builder.
362 using layout_policy_type = layout_flag_t<raw_value_type>;
363
364 return type(
365 static_cast<std::uint64_t>(list_size),
366 array(build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{})),
367 where_null(t)
368 );
369 }
370 };
371
372 template <translate_to_struct_layout T, class OPTION_FLAGS>
373 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
374 {
376 static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
378
379 template <class U>
380 [[nodiscard]] static type create(U&& t)
381 {
382 std::vector<array> detyped_children(n_children);
384 [&](auto i)
385 {
386 auto tuple_i_col = t
387 | std::views::transform(
388 [](const auto& maybe_nullable_tuple)
389 {
390 const auto& tuple_val = ensure_value(maybe_nullable_tuple);
391 return std::get<decltype(i)::value>(tuple_val);
392 }
393 );
394
395 using tuple_element_type = std::tuple_element_t<decltype(i)::value, tuple_type>;
396 using layout_policy_type = layout_flag_t<tuple_element_type>;
397 detyped_children[decltype(i)::value] = array(
398 build_impl<layout_policy_type>(tuple_i_col, OPTION_FLAGS{})
399 );
400 }
401 );
402
403 return type(std::move(detyped_children), where_null(t));
404 }
405 };
406
407 template <translate_to_variable_sized_binary_layout T, class OPTION_FLAGS>
408 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
409 {
411
412 template <class U>
413 [[nodiscard]] static type create(U&& t)
414 {
415 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
416 u8_buffer<char> data_buffer(flat_list_view);
417
418 auto sizes = t
419 | std::views::transform(
420 [](const auto& l)
421 {
422 return get_size_save(l);
423 }
424 );
425
426 return type(std::move(data_buffer), type::offset_from_sizes(sizes), where_null(t));
427 }
428 };
429
430 template <translate_to_fixed_width_binary_layout T, class OPTION_FLAGS>
431 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
432 {
434
435 template <class U>
436 [[nodiscard]] static type create(U&& t)
437 {
438 return type(std::move(t));
439 }
440 };
441
442 template <translate_to_union_layout T, class OPTION_FLAGS>
443 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
444 {
445 using type = sparrow::sparse_union_array; // TODO use options to select between sparse and dense
446 using variant_type = std::ranges::range_value_t<T>;
447 static constexpr std::size_t variant_size = std::variant_size_v<variant_type>;
448
449 template <class U>
450 [[nodiscard]] static type create(U&& t)
451 requires(std::is_same_v<type, sparrow::sparse_union_array>)
452 {
453 std::vector<array> detyped_children(variant_size);
455 [&](auto i)
456 {
457 using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
458 auto type_i_col = t
459 | std::views::transform(
460 [](const auto& variant)
461 {
462 return variant.index() == decltype(i)::value
463 ? std::get<type_at_index>(variant)
464 : type_at_index{};
465 }
466 );
467
468 using layout_policy_type = layout_flag_t<type_at_index>;
469 detyped_children[decltype(i)::value] = array(
470 build_impl<layout_policy_type>(type_i_col, OPTION_FLAGS{})
471 );
472 }
473 );
474
475 // type-ids
476 auto type_id_range = t
477 | std::views::transform(
478 [](const auto& v)
479 {
480 return static_cast<std::uint8_t>(v.index());
481 }
482 );
483 sparrow::u8_buffer<std::uint8_t> type_id_buffer(type_id_range);
484
485 return type(std::move(detyped_children), std::move(type_id_buffer));
486 }
487 };
488
489 template <class T, class OPTION_FLAGS>
490 struct builder<T, enforce_dict_encoded_layout, OPTION_FLAGS>
491 {
492 using key_type = std::uint64_t;
494 // keep the nulls
495 using raw_range_value_type = std::ranges::range_value_t<T>;
496
497 template <class U>
498 [[nodiscard]] static type create(U&& t)
499 {
500 const auto input_size = range_size(t);
501 key_type key = 0;
502 std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
503 std::vector<raw_range_value_type> values;
504 std::vector<key_type> keys;
505
506 values.reserve(input_size);
507 keys.reserve(input_size);
508
509 for (const auto& v : t)
510 {
511 auto find_res = value_map.find(v);
512 if (find_res == value_map.end())
513 {
514 value_map.insert({v, key});
515 values.push_back(v);
516 keys.push_back(key);
517 ++key;
518 }
519 else
520 {
521 keys.push_back(find_res->second);
522 }
523 }
524 auto keys_buffer = sparrow::u8_buffer<key_type>(keys);
525
526 // since we do not support dict[dict or dict[run_end
527 // we can hard code the layout policy here
528 using layout_policy_type = dont_enforce_layout;
529
530 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
531
532 return type(std::move(keys_buffer), array(std::move(values_array)));
533 }
534 };
535
536 template <class T, class OPTION_FLAGS>
538 {
540 using raw_range_value_type = std::ranges::range_value_t<T>;
541
542 template <class U>
543 [[nodiscard]] static type create(U&& t)
544 {
545 using value_type = std::decay_t<raw_range_value_type>;
546
547 const auto input_size = range_size(t);
548
549 std::vector<value_type> values{};
550 std::vector<int64_t> acc_run_lengths{};
551
552 values.reserve(input_size);
553 acc_run_lengths.reserve(input_size);
554
555 const auto eq = nested_eq<value_type>{};
556
557 // accumulate the run lengths
558 int64_t i = 0;
559 for (const auto& v : t)
560 {
561 // first value
562 if (i == 0)
563 {
564 values.push_back(v);
565 }
566 // rest
567 else
568 {
569 if (!eq(values.back(), v))
570 {
571 acc_run_lengths.push_back(i);
572 values.push_back(v);
573 }
574 }
575 ++i;
576 }
577 acc_run_lengths.push_back(i);
578
579 auto run_length_typed_array = primitive_array<int64_t>(acc_run_lengths);
580
581 // since we do not support dict[dict or dict[run_end
582 // we can hard code the layout policy here
583 using layout_policy_type = dont_enforce_layout;
584 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
585
586 return type(array(std::move(run_length_typed_array)), array(std::move(values_array)));
587 }
588 };
589 } // namespace detail
590} // namespace sparrow
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
std::conditional_t< is_dict_encode< mnv_t< T > >, enforce_dict_encoded_layout, std::conditional_t< is_run_end_encode< mnv_t< T > >, enforce_run_end_encoded_layout, dont_enforce_layout > > layout_flag_t
look_trough_t< std::ranges::range_value_t< T > > ensured_range_value_t
ensured_range_value_t< ensured_range_value_t< T > > nested_ensured_range_inner_value_t
decltype(auto) ensure_value_range(T &&t)
std::vector< std::size_t > where_null(T &&t)
decltype(auto) ensure_value(T &&t)
void for_each_index(F &&f)
auto build_impl(T &&t, sparrow::mpl::typelist< OPTION_FLAGS... > typelist)
Definition builder.hpp:113
std::size_t get_size_save(T &&t)
consteval bool contains(L list)
Definition mp_utils.hpp:285
consteval bool any_of(L< T... >, Predicate predicate={})
Checks that at least one type in the provided list makes the provided predicate return true.
Definition mp_utils.hpp:244
constexpr bool is_type_instance_of_v
true if T is a concrete instantiation of the type template U.
Definition mp_utils.hpp:50
std::byte byte_t
primitive_array_impl< T > time_array
Array of time values.
mpl:: typelist< chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds > time_types_t
list_array_impl< false > list_array
primitive_array_impl< T > interval_array
Array of interval values.
auto build(T &&t, OPTION_FLAGS &&...)
function to create a sparrow array from arbitrary nested combinations of ranges, tuples,...
Definition builder.hpp:77
mpl::typelist< zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds > zoned_time_without_timezone_types_t
mpl::typelist< date_days, date_milliseconds > date_types_t
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
constexpr large_list_flag_t large_list_flag
Definition builder.hpp:69
primitive_array_impl< T > date_array
Array of std::chrono::duration values.
primitive_array_impl< T > primitive_array
Array of values of whose type has fixed binary size.
date::zoned_time< Duration, TimeZonePtr > timestamp
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
primitive_array_impl< T > timestamp_without_timezone_array
Array of std::chrono::duration values.
list_array_impl< true > big_list_array
primitive_array_impl< T > duration_array
Array of std::chrono::duration values.
mpl::typelist< chrono::months, days_time_interval, month_day_nanoseconds_interval > interval_types_t
std::size_t range_size(R &&r)
Definition ranges.hpp:31
mpl:: typelist< std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds > duration_types_t
sparrow::primitive_array< ensured_range_value_t< T > > type
Definition builder.hpp:221
std::decay_t< decltype(std::declval< ensured_range_value_t< T > >().get_time_zone())> timezone_ptr
Definition builder.hpp:258
Workaround to replace static_assert(false) in template code.
Definition mp_utils.hpp:33
Compile-time type predicate: true if the evaluated type is the same as T.
Definition mp_utils.hpp:207
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:55