sparrow 0.3.0
Loading...
Searching...
No Matches
builder.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16#pragma once
17
18#include <map>
19#include <ranges>
20#include <tuple>
21#include <type_traits>
22#include <utility>
23#include <vector>
24
25#include "sparrow/array.hpp"
40
41namespace sparrow
42{
43
44 // forward declaration
45 namespace detail
46 {
47 template <class T, class LAYOUT_POLICY, class OPTIONS_TYPE>
48 struct builder;
49 }
50
52 {
53 };
54
56 {
57 };
58
60 {
61 };
62
64 {
65 };
66
67 // option flag to indicate the desire for large lists
69
75 template <class T, class... OPTION_FLAGS>
76 [[nodiscard]] auto build(T&& t, OPTION_FLAGS&&...)
77 {
78 // for toplevel build calls, the layout policy is determined by the type itself
79 using decayed_t = std::decay_t<T>;
80 using layout_policy = detail::layout_flag_t<decayed_t>;
81 using option_flags_type = sparrow::mpl::typelist<std::decay_t<OPTION_FLAGS>...>;
82
84 {
85 // directly unpack
86 using value_type = typename decayed_t::value_type;
88 std::forward<T>(t).get()
89 );
90 }
91 else if constexpr (detail::is_nullable_like<T>)
92 {
93 static_assert(mpl::dependent_false<T>::value, "toplevel type must not be nullable");
94 }
95 else
96 {
98 }
99 }
100
101 template <class T, class... OPTION_FLAGS>
102 [[nodiscard]] auto build(std::initializer_list<T> t, OPTION_FLAGS&&... flags)
103 {
104 auto subranges = std::views::all(t);
105 return build(std::forward<decltype(subranges)>(subranges), std::forward<OPTION_FLAGS>(flags)...);
106 }
107
108 namespace detail
109 {
110 // this is called by the nested recursive calls
111 template <class LAYOUT_POLICY, class T, class... OPTION_FLAGS>
112 [[nodiscard]] auto build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...> typelist)
113 {
114 using option_flags_type = sparrow::mpl::typelist<OPTION_FLAGS...>;
116 }
117
118 template <class T>
119 concept translates_to_primitive_layout = std::ranges::input_range<T>
120 && std::is_scalar_v<ensured_range_value_t<T>>;
121
122 template <typename T>
123 concept translates_to_date_layout = std::ranges::input_range<T>
124 && mpl::any_of(
125 date_types_t{},
127 );
128 template <typename T>
129 concept translates_to_duration_layout = std::ranges::input_range<T>
130 && mpl::any_of(
133 );
134 template <typename T>
135 concept translates_to_timestamp_layout = std::ranges::input_range<T>
137
138
139 template <typename T>
140 concept translates_to_interval_layout = std::ranges::input_range<T>
141 && mpl::any_of(
144 );
145
146 template <typename T>
147 concept translates_to_time_layout = std::ranges::input_range<T>
148 && mpl::any_of(
149 time_types_t{},
151 );
152
153 template <class T>
154 concept translate_to_variable_sized_list_layout = std::ranges::input_range<T>
155 && std::ranges::input_range<ensured_range_value_t<T>>
157 && // tuples go to struct layout
158 // value type of inner should not be 'char-like'(
159 // char, byte, uint8), these are handled by
160 // variable_size_binary_array
162
163 template <class T>
164 concept translate_to_struct_layout = std::ranges::input_range<T> && tuple_like<ensured_range_value_t<T>>
166
167 template <class T>
169 && std::is_same_v<std::ranges::range_value_t<T>, byte_t>;
170
171 template <class T>
172 concept translate_to_fixed_sized_list_layout = std::ranges::input_range<T>
176
177 template <class T>
178 concept translate_to_variable_sized_binary_layout = std::ranges::input_range<T>
179 && std::ranges::input_range<ensured_range_value_t<T>>
182 && // tuples go to struct layout
183 // value type of inner must be char like ( char,
184 // byte, uint8)
186
187 template <class T>
188 concept translate_to_fixed_width_binary_layout = std::ranges::input_range<T>
192
193 template <class T>
194 concept translate_to_union_layout = std::ranges::input_range<T> &&
195 // value type must be a variant-like type
196 // *NOTE* we don't check for nullable here, as we want to handle
197 // nullable variants as in the arrow spec, the nulls are handled
198 // by the elements **in** the variant
200
201 template <translates_to_primitive_layout T, class OPTION_FLAGS>
202 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
203 {
205
206 template <class U>
207 [[nodiscard]] static type create(U&& t)
208 {
209 return type(std::forward<U>(t));
210 }
211 };
212
213 template <translates_to_date_layout T, class OPTION_FLAGS>
214 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
215 {
217
218 template <class U>
219 static type create(U&& t)
220 {
221 return type(std::forward<U>(t));
222 }
223 };
224
225 template <translates_to_duration_layout T, class OPTION_FLAGS>
226 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
227 {
229
230 template <class U>
231 [[nodiscard]] static type create(U&& t)
232 {
233 return type(std::forward<U>(t));
234 }
235 };
236
237 template <translates_to_timestamp_layout T, class OPTION_FLAGS>
238 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
239 {
241 using timezone_ptr = std::decay_t<decltype(std::declval<ensured_range_value_t<T>>().get_time_zone())>;
242
243 template <class U>
244 [[nodiscard]] static type create(U&& t)
245 {
246 timezone_ptr tz = [&t]() -> timezone_ptr
247 {
248 if (t.empty())
249 {
250 return nullptr;
251 }
252 else
253 {
254 return t.begin()->get_time_zone();
255 }
256 }();
257 return type(tz, std::forward<U>(t));
258 }
259 };
260
261 template <translates_to_interval_layout T, class OPTION_FLAGS>
262 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
263 {
265
266 template <class U>
267 [[nodiscard]] static type create(U&& t)
268 {
269 return type(std::forward<U>(t));
270 }
271 };
272
273 template <translates_to_time_layout T, class OPTION_FLAGS>
274 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
275 {
277
278 template <class U>
279 static type create(U&& t)
280 {
281 return type(std::forward<U>(t));
282 }
283 };
284
285 template <translate_to_variable_sized_list_layout T, class OPTION_FLAGS>
286 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
287 {
288 using raw_value_type = std::ranges::range_value_t<T>;
289
290 using type = std::conditional_t<
291 mpl::contains<large_list_flag_t>(OPTION_FLAGS{}),
294
295 template <class U>
296 [[nodiscard]] static type create(U&& t)
297 {
298 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
299
300 auto sizes = t
301 | std::views::transform(
302 [](const auto& l)
303 {
304 return get_size_save(l);
305 }
306 );
307
308 // when the raw_value_type is an "express layout desire" we need to
309 // propagate this information to the builder.
310 using layout_policy_type = layout_flag_t<raw_value_type>;
311 auto typed_array = build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{});
312 auto detyped_array = array(std::move(typed_array));
313
314 return type(std::move(detyped_array), type::offset_from_sizes(sizes), where_null(t));
315 }
316 };
317
318 template <translate_to_fixed_sized_list_layout T, class OPTION_FLAGS>
319 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
320 {
322 static constexpr std::size_t
323 list_size = std::tuple_size_v<look_trough_t<std::ranges::range_value_t<T>>>;
324 using raw_value_type = std::ranges::range_value_t<T>;
325
326 template <class U>
327 [[nodiscard]] static type create(U&& t)
328 {
329 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
330
331 // when the raw_value_type is an "express layout desire" we need to
332 // propagate this information to the builder.
333 using layout_policy_type = layout_flag_t<raw_value_type>;
334
335 return type(
336 static_cast<std::uint64_t>(list_size),
337 array(build_impl<layout_policy_type>(flat_list_view, OPTION_FLAGS{})),
338 where_null(t)
339 );
340 }
341 };
342
343 template <translate_to_struct_layout T, class OPTION_FLAGS>
344 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
345 {
347 static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
349
350 template <class U>
351 [[nodiscard]] static type create(U&& t)
352 {
353 std::vector<array> detyped_children(n_children);
355 [&](auto i)
356 {
357 auto tuple_i_col = t
358 | std::views::transform(
359 [](const auto& maybe_nullable_tuple)
360 {
361 const auto& tuple_val = ensure_value(maybe_nullable_tuple);
362 return std::get<decltype(i)::value>(tuple_val);
363 }
364 );
365
366 using tuple_element_type = std::tuple_element_t<decltype(i)::value, tuple_type>;
367 using layout_policy_type = layout_flag_t<tuple_element_type>;
368 detyped_children[decltype(i)::value] = array(
369 build_impl<layout_policy_type>(tuple_i_col, OPTION_FLAGS{})
370 );
371 }
372 );
373
374 return type(std::move(detyped_children), where_null(t));
375 }
376 };
377
378 template <translate_to_variable_sized_binary_layout T, class OPTION_FLAGS>
379 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
380 {
382
383 template <class U>
384 [[nodiscard]] static type create(U&& t)
385 {
386 auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
387 u8_buffer<char> data_buffer(flat_list_view);
388
389 auto sizes = t
390 | std::views::transform(
391 [](const auto& l)
392 {
393 return get_size_save(l);
394 }
395 );
396
397 return type(std::move(data_buffer), type::offset_from_sizes(sizes), where_null(t));
398 }
399 };
400
401 template <translate_to_fixed_width_binary_layout T, class OPTION_FLAGS>
402 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
403 {
405
406 template <class U>
407 [[nodiscard]] static type create(U&& t)
408 {
409 return type(std::move(t));
410 }
411 };
412
413 template <translate_to_union_layout T, class OPTION_FLAGS>
414 struct builder<T, dont_enforce_layout, OPTION_FLAGS>
415 {
416 using type = sparrow::sparse_union_array; // TODO use options to select between sparse and dense
417 using variant_type = std::ranges::range_value_t<T>;
418 static constexpr std::size_t variant_size = std::variant_size_v<variant_type>;
419
420 template <class U>
421 [[nodiscard]] static type create(U&& t)
422 requires(std::is_same_v<type, sparrow::sparse_union_array>)
423 {
424 std::vector<array> detyped_children(variant_size);
426 [&](auto i)
427 {
428 using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
429 auto type_i_col = t
430 | std::views::transform(
431 [](const auto& variant)
432 {
433 return variant.index() == decltype(i)::value
434 ? std::get<type_at_index>(variant)
435 : type_at_index{};
436 }
437 );
438
439 using layout_policy_type = layout_flag_t<type_at_index>;
440 detyped_children[decltype(i)::value] = array(
441 build_impl<layout_policy_type>(type_i_col, OPTION_FLAGS{})
442 );
443 }
444 );
445
446 // type-ids
447 auto type_id_range = t
448 | std::views::transform(
449 [](const auto& v)
450 {
451 return static_cast<std::uint8_t>(v.index());
452 }
453 );
454 sparrow::u8_buffer<std::uint8_t> type_id_buffer(type_id_range);
455
456 return type(std::move(detyped_children), std::move(type_id_buffer));
457 }
458 };
459
460 template <class T, class OPTION_FLAGS>
461 struct builder<T, enforce_dict_encoded_layout, OPTION_FLAGS>
462 {
463 using key_type = std::uint64_t;
465 // keep the nulls
466 using raw_range_value_type = std::ranges::range_value_t<T>;
467
468 template <class U>
469 [[nodiscard]] static type create(U&& t)
470 {
471 const auto input_size = range_size(t);
472 key_type key = 0;
473 std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
474 std::vector<raw_range_value_type> values;
475 std::vector<key_type> keys;
476
477 values.reserve(input_size);
478 keys.reserve(input_size);
479
480 for (const auto& v : t)
481 {
482 auto find_res = value_map.find(v);
483 if (find_res == value_map.end())
484 {
485 value_map.insert({v, key});
486 values.push_back(v);
487 keys.push_back(key);
488 ++key;
489 }
490 else
491 {
492 keys.push_back(find_res->second);
493 }
494 }
495 auto keys_buffer = sparrow::u8_buffer<key_type>(keys);
496
497 // since we do not support dict[dict] or dict[run_end]
498 // we can hard code the layout policy here
499 using layout_policy_type = dont_enforce_layout;
500
501 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
502
503 return type(std::move(keys_buffer), array(std::move(values_array)));
504 }
505 };
506
507 template <class T, class OPTION_FLAGS>
509 {
511 using raw_range_value_type = std::ranges::range_value_t<T>;
512
513 template <class U>
514 [[nodiscard]] static type create(U&& t)
515 {
516 using value_type = std::decay_t<raw_range_value_type>;
517
518 const auto input_size = range_size(t);
519
520 std::vector<value_type> values{};
521 std::vector<std::size_t> acc_run_lengths{};
522
523 values.reserve(input_size);
524 acc_run_lengths.reserve(input_size);
525
526 const auto eq = nested_eq<value_type>{};
527
528 // accumulate the run lengths
529 std::size_t i = 0;
530 for (const auto& v : t)
531 {
532 // first value
533 if (i == 0)
534 {
535 values.push_back(v);
536 }
537 // rest
538 else
539 {
540 if (!eq(values.back(), v))
541 {
542 acc_run_lengths.push_back(i);
543 values.push_back(v);
544 }
545 }
546 ++i;
547 }
548 acc_run_lengths.push_back(i);
549
550 auto run_length_typed_array = primitive_array<std::size_t>(acc_run_lengths);
551
552 // since we do not support run_end[dict] or run_end[run_end]
553 // we can hard code the layout policy here
554 using layout_policy_type = dont_enforce_layout;
555 auto values_array = build_impl<layout_policy_type>(values, OPTION_FLAGS{});
556
557 return type(array(std::move(run_length_typed_array)), array(std::move(values_array)));
558 }
559 };
560 } // namespace detail
561} // namespace sparrow
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
std::conditional_t< is_dict_encode< mnv_t< T > >, enforce_dict_encoded_layout, std::conditional_t< is_run_end_encode< mnv_t< T > >, enforce_run_end_encoded_layout, dont_enforce_layout > > layout_flag_t
look_trough_t< std::ranges::range_value_t< T > > ensured_range_value_t
ensured_range_value_t< ensured_range_value_t< T > > nested_ensured_range_inner_value_t
decltype(auto) ensure_value_range(T &&t)
std::vector< std::size_t > where_null(T &&t)
decltype(auto) ensure_value(T &&t)
void for_each_index(F &&f)
auto build_impl(T &&t, sparrow::mpl::typelist< OPTION_FLAGS... > typelist)
Definition builder.hpp:112
std::size_t get_size_save(T &&t)
consteval bool contains(L list)
Definition mp_utils.hpp:285
consteval bool any_of(L< T... >, Predicate predicate={})
Checks that at least one type in the provided list makes the provided predicate return true.
Definition mp_utils.hpp:244
constexpr bool is_type_instance_of_v
true if T is a concrete type template instantiation of U which is a type template.
Definition mp_utils.hpp:50
array_trivial_copyable< T > date_array
Array of std::chrono::duration values.
std::byte byte_t
mpl:: typelist< chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds > time_types_t
array_trivial_copyable< T > duration_array
Array of std::chrono::duration values.
list_array_impl< false > list_array
array_trivial_copyable< T > time_array
Array of time values.
array_trivial_copyable< T > interval_array
Array of interval values.
auto build(T &&t, OPTION_FLAGS &&...)
function to create a sparrow array from arbitrary nested combinations of ranges, tuples,...
Definition builder.hpp:76
mpl::typelist< date_days, date_milliseconds > date_types_t
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
constexpr large_list_flag_t large_list_flag
Definition builder.hpp:68
date::zoned_time< Duration, TimeZonePtr > timestamp
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
list_array_impl< true > big_list_array
mpl::typelist< chrono::months, days_time_interval, month_day_nanoseconds_interval > interval_types_t
std::size_t range_size(R &&r)
Definition ranges.hpp:31
array_trivial_copyable< T > primitive_array
Array of values of whose type has fixed binary size.
mpl:: typelist< std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds > duration_types_t
sparrow::primitive_array< ensured_range_value_t< T > > type
Definition builder.hpp:204
std::decay_t< decltype(std::declval< ensured_range_value_t< T > >().get_time_zone())> timezone_ptr
Definition builder.hpp:241
Workaround to replace static_assert(false) in template code.
Definition mp_utils.hpp:33
Compile-time type predicate: true if the evaluated type is the same as T.
Definition mp_utils.hpp:207
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:55