sparrow 0.6.0
Loading...
Searching...
No Matches
builder.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16#pragma once
17
18#include <map>
19#include <ranges>
20#include <tuple>
21#include <type_traits>
22#include <utility>
23#include <vector>
24
25#include "sparrow/array.hpp"
40
41namespace sparrow
42{
43
44 // forward declaration
45 namespace detail
46 {
47 template <class T, class LAYOUT_POLICY, class OPTIONS_TYPE>
48 struct builder;
49 }
50
52 {
53 };
54
56 {
57 };
58
60 {
61 };
62
64 {
65 };
66
67 // option flag to indicate the desire for large lists
69
75 template <class T, class... OPTION_FLAGS>
76 [[nodiscard]] auto build(T&& t, OPTION_FLAGS&&...)
77 {
78 // for toplevel build calls, the layout policy is determined by the type itself
79 using decayed_t = std::decay_t<T>;
80 using layout_policy = detail::layout_flag_t<decayed_t>;
81 using option_flags_type = sparrow::mpl::typelist<std::decay_t<OPTION_FLAGS>...>;
82
84 {
85 // directly unpack
86 using value_type = typename decayed_t::value_type;
88 std::forward<T>(t).get()
89 );
90 }
91 else if constexpr (detail::is_nullable_like<T>)
92 {
93 static_assert(mpl::dependent_false<T>::value, "toplevel type must not be nullable");
94 }
95 else
96 {
98 }
99 }
100
// Overload of build() for braced initializer lists.
// Wraps the initializer list in a std::views::all view and delegates to the
// range-based build() overload, forwarding the option flags unchanged.
// NOTE(review): `subranges` is a local declared with `auto`, so
// std::forward<decltype(subranges)> hands it on as an rvalue.
101 template <class T, class... OPTION_FLAGS>
102 [[nodiscard]] auto build(std::initializer_list<T> t, OPTION_FLAGS&&... flags)
103 {
104 auto subranges = std::views::all(t);
105 return build(std::forward<decltype(subranges)>(subranges), std::forward<OPTION_FLAGS>(flags)...);
106 }
107
108 namespace detail
109 {
110 // this is called by the nested recursive calls
111 template <class LAYOUT_POLICY, class T, class... OPTION_FLAGS>
112 [[nodiscard]] auto build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...> typelist)
113 {
114 using option_flags_type = sparrow::mpl::typelist<OPTION_FLAGS...>;
116 }
117
// Satisfied when T is an input range and ensured_range_value_t<T> is a
// scalar type according to std::is_scalar_v.
118 template <class T>
119 concept translates_to_primitive_layout = std::ranges::input_range<T>
120 && std::is_scalar_v<ensured_range_value_t<T>>;
121
122 template <typename T>
123 concept translates_to_date_layout = std::ranges::input_range<T>
124 && mpl::any_of(
125 date_types_t{},
127 );
128 template <typename T>
129 concept translates_to_duration_layout = std::ranges::input_range<T>
130 && mpl::any_of(
133 );
134 template <typename T>
135 concept translates_to_timestamp_layout = std::ranges::input_range<T>
137
138
139 template <typename T>
140 concept translates_to_interval_layout = std::ranges::input_range<T>
141 && mpl::any_of(
144 );
145
146 template <typename T>
147 concept translates_to_time_layout = std::ranges::input_range<T>
148 && mpl::any_of(
149 time_types_t{},
151 );
152
153 template <class T>
154 concept translate_to_variable_sized_list_layout = std::ranges::input_range<T>
155 && std::ranges::input_range<ensured_range_value_t<T>>
157 && // tuples go to struct layout
158 // value type of inner should not be 'char-like'(
159 // char, byte, uint8), these are handled by
160 // variable_size_binary_array
162
163 template <class T>
164 concept translate_to_struct_layout = std::ranges::input_range<T> && tuple_like<ensured_range_value_t<T>>
166
167 template <class T>
169 && std::is_same_v<std::ranges::range_value_t<T>, byte_t>;
170
171 template <class T>
172 concept translate_to_fixed_sized_list_layout = std::ranges::input_range<T>
174 && !(
178 )
180
181 template <class T>
182 concept translate_to_variable_sized_binary_layout = std::ranges::input_range<T>
183 && std::ranges::input_range<ensured_range_value_t<T>>
184 && !(
188 )
190 && // tuples go to struct layout
191 // value type of inner must be char like ( char,
192 // byte, uint8)
194
195 template <class T>
196 concept translate_to_fixed_width_binary_layout = std::ranges::input_range<T>
200 );
201
202 template <class T>
203 concept translate_to_union_layout = std::ranges::input_range<T> &&
204 // value type must be a variant-like type
205 // *NOTE* we don't check for nullable here, as we want to handle
206 // nullable variants as in the arrow spec, the nulls are handled
207 // by the elements **in** the variant
209
210 template <translates_to_primitive_layout T, class OPTION_FLAGS>
211 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
212 {
214
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
215 template <class U>
216 [[nodiscard]] static type create(U&& t)
217 {
218 return type(std::forward<U>(t));
219 }
220 };
221
222 template <translates_to_date_layout T, class OPTION_FLAGS>
223 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
224 {
226
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
// Marked [[nodiscard]] like the other builder `create` functions: discarding
// the freshly built array is almost certainly a mistake.
227 template <class U>
228 [[nodiscard]] static type create(U&& t)
229 {
230 return type(std::forward<U>(t));
231 }
232 };
233
234 template <translates_to_duration_layout T, class OPTION_FLAGS>
235 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
236 {
238
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
239 template <class U>
240 [[nodiscard]] static type create(U&& t)
241 {
242 return type(std::forward<U>(t));
243 }
244 };
245
246 template <translates_to_timestamp_layout T, class OPTION_FLAGS>
247 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
248 {
250 using timezone_ptr = std::decay_t<decltype(std::declval<ensured_range_value_t<T>>().get_time_zone())>;
251
// Builds the array from a range of timestamps.
// The time zone handed to the constructor of `type` is read from the first
// element of `t`; an empty range yields a null time zone pointer.
// NOTE(review): only the first element is inspected, so this presumably
// assumes every element shares the same time zone - confirm with callers.
252 template <class U>
253 [[nodiscard]] static type create(U&& t)
254 {
// immediately-invoked lambda so that `tz` is initialised in one expression
255 timezone_ptr tz = [&t]() -> timezone_ptr
256 {
257 if (t.empty())
258 {
259 return nullptr;
260 }
261 else
262 {
263 return t.begin()->get_time_zone();
264 }
265 }();
// `t` is forwarded only after the time zone has been read from it
266 return type(tz, std::forward<U>(t));
267 }
268 };
269
270 template <translates_to_interval_layout T, class OPTION_FLAGS>
271 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
272 {
274
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
275 template <class U>
276 [[nodiscard]] static type create(U&& t)
277 {
278 return type(std::forward<U>(t));
279 }
280 };
281
282 template <translates_to_time_layout T, class OPTION_FLAGS>
283 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
284 {
286
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
// Marked [[nodiscard]] like the other builder `create` functions: discarding
// the freshly built array is almost certainly a mistake.
287 template <class U>
288 [[nodiscard]] static type create(U&& t)
289 {
290 return type(std::forward<U>(t));
291 }
292 };
293
294 template <translate_to_variable_sized_list_layout T, class OPTION_FLAGS>
295 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
296 {
297 using raw_value_type = std::ranges::range_value_t<T>;
298
299 using type = std::conditional_t<
300 mpl::contains<large_list_flag_t>(OPTION_FLAGS{}),
303
// Builds a variable-sized list array from a range of ranges.
// The inner ranges are joined into one flat child sequence, which is built
// recursively through build_impl; the per-list sizes are turned into offsets
// via type::offset_from_sizes, and where_null(t) is passed as the last
// constructor argument.
304 template <class U>
305 [[nodiscard]] static type create(U&& t)
306 {
// flat view over all inner elements, in order
307 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
308
// lazy view yielding the size of each inner list (via get_size_save)
309 auto sizes = t
310 | std::views::transform(
311 [](const auto& l)
312 {
313 return get_size_save(l);
314 }
315 );
316
317 // when the raw_value_type expresses a layout desire we need to
318 // propagate this information to the builder.
319 using layout_policy_type = layout_flag_t<raw_value_type>;
320 auto typed_array = build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{});
// wrap the typed child into the dynamically typed `array`
321 auto detyped_array = array(std::move(typed_array));
322
323 return type(std::move(detyped_array), type::offset_from_sizes(sizes), where_null(t));
324 }
325 };
326
327 template <translate_to_fixed_sized_list_layout T, class OPTION_FLAGS>
328 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
329 {
331 static constexpr std::size_t
332 list_size = std::tuple_size_v<look_trough_t<std::ranges::range_value_t<T>>>;
333 using raw_value_type = std::ranges::range_value_t<T>;
334
// Builds a fixed-sized list array from a range of fixed-size inner ranges.
// The inner ranges are joined into one flat child sequence that is built
// recursively through build_impl; `list_size` (the tuple size of the inner
// value type) is passed as the per-list length together with where_null(t).
335 template <class U>
336 [[nodiscard]] static type create(U&& t)
337 {
// flat view over all inner elements, in order
338 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
339
340 // when the raw_value_type is a "express layout desire" we need to
341 // propagate this information to the builder.
342 using layout_policy_type = layout_flag_t<raw_value_type>;
343
344 return type(
345 static_cast<std::uint64_t>(list_size),
346 array(build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{})),
347 where_null(t)
348 );
349 }
350 };
351
352 template <translate_to_struct_layout T, class OPTION_FLAGS>
353 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
354 {
356 static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
358
359 template <class U>
360 [[nodiscard]] static type create(U&& t)
361 {
362 std::vector<array> detyped_children(n_children);
364 [&](auto i)
365 {
366 auto tuple_i_col = t
367 | std::views::transform(
368 [](const auto& maybe_nullable_tuple)
369 {
370 const auto& tuple_val = ensure_value(maybe_nullable_tuple);
371 return std::get<decltype(i)::value>(tuple_val);
372 }
373 );
374
375 using tuple_element_type = std::tuple_element_t<decltype(i)::value, tuple_type>;
376 using layout_policy_type = layout_flag_t<tuple_element_type>;
377 detyped_children[decltype(i)::value] = array(
378 build_impl<layout_policy_type>(tuple_i_col, OPTION_FLAGS{})
379 );
380 }
381 );
382
383 return type(std::move(detyped_children), where_null(t));
384 }
385 };
386
387 template <translate_to_variable_sized_binary_layout T, class OPTION_FLAGS>
388 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
389 {
391
// Builds a variable-sized binary array from a range of char-like ranges.
// All inner elements are joined and copied into a single u8_buffer<char>;
// the per-element sizes are turned into offsets via type::offset_from_sizes,
// and where_null(t) is passed as the last constructor argument.
392 template <class U>
393 [[nodiscard]] static type create(U&& t)
394 {
// flat view over every character of every inner range, in order
395 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
396 u8_buffer<char> data_buffer(flat_list_view);
397
// lazy view yielding the size of each inner range (via get_size_save)
398 auto sizes = t
399 | std::views::transform(
400 [](const auto& l)
401 {
402 return get_size_save(l);
403 }
404 );
405
406 return type(std::move(data_buffer), type::offset_from_sizes(sizes), where_null(t));
407 }
408 };
409
410 template <translate_to_fixed_width_binary_layout T, class OPTION_FLAGS>
411 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
412 {
414
// Builds the array by perfect-forwarding the input range `t` to the
// constructor of `type`.
// `t` is a forwarding reference, so std::forward is used (as in the other
// builders): std::move would also move from lvalues the caller still owns.
415 template <class U>
416 [[nodiscard]] static type create(U&& t)
417 {
418 return type(std::forward<U>(t));
419 }
420 };
421
422 template <translate_to_union_layout T, class OPTION_FLAGS>
423 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
424 {
425 using type = sparrow::sparse_union_array; // TODO use options to select between sparse and dense
426 using variant_type = std::ranges::range_value_t<T>;
427 static constexpr std::size_t variant_size = std::variant_size_v<variant_type>;
428
429 template <class U>
430 [[nodiscard]] static type create(U&& t)
431 requires(std::is_same_v<type, sparrow::sparse_union_array>)
432 {
433 std::vector<array> detyped_children(variant_size);
435 [&](auto i)
436 {
437 using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
438 auto type_i_col = t
439 | std::views::transform(
440 [](const auto& variant)
441 {
442 return variant.index() == decltype(i)::value
443 ? std::get<type_at_index>(variant)
444 : type_at_index{};
445 }
446 );
447
448 using layout_policy_type = layout_flag_t<type_at_index>;
449 detyped_children[decltype(i)::value] = array(
450 build_impl<layout_policy_type>(type_i_col, OPTION_FLAGS{})
451 );
452 }
453 );
454
455 // type-ids
456 auto type_id_range = t
457 | std::views::transform(
458 [](const auto& v)
459 {
460 return static_cast<std::uint8_t>(v.index());
461 }
462 );
463 sparrow::u8_buffer<std::uint8_t> type_id_buffer(type_id_range);
464
465 return type(std::move(detyped_children), std::move(type_id_buffer));
466 }
467 };
468
469 template <class T, class OPTION_FLAGS>
470 struct builder<T, enforce_dict_encoded_layout, OPTION_FLAGS>
471 {
472 using key_type = std::uint64_t;
474 // keep the nulls
475 using raw_range_value_type = std::ranges::range_value_t<T>;
476
// Builds a dictionary-encoded array from the range `t`.
// Each distinct value (ordered with nested_less) is assigned a consecutive
// key, starting at 0, in order of first appearance. `values` holds every
// distinct value once, `keys` holds one key per input element.
477 template <class U>
478 [[nodiscard]] static type create(U&& t)
479 {
480 const auto input_size = range_size(t);
// next key to hand out
481 key_type key = 0;
482 std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
483 std::vector<raw_range_value_type> values;
484 std::vector<key_type> keys;
485
// upper bound: there are at most input_size distinct values
486 values.reserve(input_size);
487 keys.reserve(input_size);
488
489 for (const auto& v : t)
490 {
491 auto find_res = value_map.find(v);
492 if (find_res == value_map.end())
493 {
// first occurrence: register the value under a fresh key
494 value_map.insert({v, key});
495 values.push_back(v);
496 keys.push_back(key);
497 ++key;
498 }
499 else
500 {
// already seen: reuse the key assigned earlier
501 keys.push_back(find_res->second);
502 }
503 }
504 auto keys_buffer = sparrow::u8_buffer<key_type>(keys);
505
506 // since we do not support dict[dict or dict[run_end
507 // we can hard code the layout policy here
508 using layout_policy_type = dont_enforce_layout;
509
510 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
511
512 return type(std::move(keys_buffer), array(std::move(values_array)));
513 }
514 };
515
516 template <class T, class OPTION_FLAGS>
518 {
520 using raw_range_value_type = std::ranges::range_value_t<T>;
521
// Builds a run-end encoded array from the range `t`.
// Consecutive elements that compare equal under nested_eq are collapsed into
// one run: `values` holds one entry per run and `acc_run_lengths` holds the
// accumulated (exclusive) end index of each run.
// An empty input produces no run end, so both children stay the same length.
522 template <class U>
523 [[nodiscard]] static type create(U&& t)
524 {
525 using value_type = std::decay_t<raw_range_value_type>;
526
527 const auto input_size = range_size(t);
528
529 std::vector<value_type> values{};
530 std::vector<std::size_t> acc_run_lengths{};
531
// upper bound: there are at most input_size runs
532 values.reserve(input_size);
533 acc_run_lengths.reserve(input_size);
534
535 const auto eq = nested_eq<value_type>{};
536
537 // accumulate the run lengths
538 std::size_t i = 0;
539 for (const auto& v : t)
540 {
541 // first value
542 if (i == 0)
543 {
544 values.push_back(v);
545 }
546 // rest
547 else
548 {
549 if (!eq(values.back(), v))
550 {
// value changed: close the previous run at index i and open a new one
551 acc_run_lengths.push_back(i);
552 values.push_back(v);
553 }
554 }
555 ++i;
556 }
// close the last run; skipped for empty input, which has no run at all
557 if (i != 0) { acc_run_lengths.push_back(i); }
558
559 auto run_length_typed_array = primitive_array<std::size_t>(acc_run_lengths);
560
561 // since we do not support dict[dict or dict[run_end
562 // we can hard code the layout policy here
563 using layout_policy_type = dont_enforce_layout;
564 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
565
566 return type(array(std::move(run_length_typed_array)), array(std::move(values_array)));
567 }
568 };
569 } // namespace detail
570} // namespace sparrow
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
std::conditional_t< is_dict_encode< mnv_t< T > >, enforce_dict_encoded_layout, std::conditional_t< is_run_end_encode< mnv_t< T > >, enforce_run_end_encoded_layout, dont_enforce_layout > > layout_flag_t
look_trough_t< std::ranges::range_value_t< T > > ensured_range_value_t
ensured_range_value_t< ensured_range_value_t< T > > nested_ensured_range_inner_value_t
decltype(auto) ensure_value_range(T &&t)
std::vector< std::size_t > where_null(T &&t)
decltype(auto) ensure_value(T &&t)
void for_each_index(F &&f)
auto build_impl(T &&t, sparrow::mpl::typelist< OPTION_FLAGS... > typelist)
Definition builder.hpp:112
std::size_t get_size_save(T &&t)
consteval bool contains(L list)
Definition mp_utils.hpp:285
consteval bool any_of(L< T... >, Predicate predicate={})
Checks that at least one type in the provided list makes the provided predicate return true.
Definition mp_utils.hpp:244
constexpr bool is_type_instance_of_v
true if T is a concrete type template instantiation of U which is a type template.
Definition mp_utils.hpp:50
std::byte byte_t
primitive_array_impl< T > time_array
Array of time values.
mpl:: typelist< chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds > time_types_t
list_array_impl< false > list_array
primitive_array_impl< T > interval_array
Array of interval values.
auto build(T &&t, OPTION_FLAGS &&...)
function to create a sparrow array from arbitrary nested combinations of ranges, tuples,...
Definition builder.hpp:76
mpl::typelist< date_days, date_milliseconds > date_types_t
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
constexpr large_list_flag_t large_list_flag
Definition builder.hpp:68
primitive_array_impl< T > date_array
Array of date values.
primitive_array_impl< T > primitive_array
Array of values of whose type has fixed binary size.
date::zoned_time< Duration, TimeZonePtr > timestamp
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_array_impl< true > big_list_array
primitive_array_impl< T > duration_array
Array of std::chrono::duration values.
mpl::typelist< chrono::months, days_time_interval, month_day_nanoseconds_interval > interval_types_t
std::size_t range_size(R &&r)
Definition ranges.hpp:33
mpl:: typelist< std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds > duration_types_t
sparrow::primitive_array< ensured_range_value_t< T > > type
Definition builder.hpp:213
std::decay_t< decltype(std::declval< ensured_range_value_t< T > >().get_time_zone())> timezone_ptr
Definition builder.hpp:250
Workaround to replace static_assert(false) in template code.
Definition mp_utils.hpp:33
Compile-time type predicate: true if the evaluated type is the same as T.
Definition mp_utils.hpp:207
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:55