sparrow 0.6.0
Loading...
Searching...
No Matches
union_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include "sparrow/array_api.hpp"
30
31namespace sparrow
32{
35
36 namespace detail
37 {
38 template <class T>
40
41 template <>
43 {
44 [[nodiscard]] static constexpr sparrow::data_type get()
45 {
47 }
48 };
49
50 template <>
52 {
53 [[nodiscard]] static constexpr sparrow::data_type get()
54 {
56 }
57 };
58 }
59
63 template <class T>
64 constexpr bool is_dense_union_array_v = std::same_as<T, dense_union_array>;
65
69 template <class T>
70 constexpr bool is_sparse_union_array_v = std::same_as<T, sparse_union_array>;
71
72 // helper crtp-base to have sparse and dense and dense union share most of their code
73 template <class DERIVED>
74 class union_array_crtp_base : public crtp_base<DERIVED>
75 {
76 public:
77
79 using derived_type = DERIVED;
86 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
87 using size_type = std::size_t;
88
90
91 [[nodiscard]] std::optional<std::string_view> name() const;
92 [[nodiscard]] std::optional<key_value_view> metadata() const;
93
94 [[nodiscard]] value_type at(size_type i) const;
95 [[nodiscard]] value_type operator[](size_type i) const;
97 [[nodiscard]] value_type front() const;
98 [[nodiscard]] value_type back() const;
99
100 [[nodiscard]] bool empty() const;
101 [[nodiscard]] size_type size() const;
102
103 [[nodiscard]] iterator begin();
104 [[nodiscard]] iterator end();
105 [[nodiscard]] const_iterator begin() const;
106 [[nodiscard]] const_iterator end() const;
107 [[nodiscard]] const_iterator cbegin() const;
108 [[nodiscard]] const_iterator cend() const;
109
110 [[nodiscard]] const_reverse_iterator rbegin() const;
111 [[nodiscard]] const_reverse_iterator rend() const;
112
113 [[nodiscard]] const_reverse_iterator crbegin() const;
114 [[nodiscard]] const_reverse_iterator crend() const;
115
116 protected:
117
118 using type_id_map = std::array<std::uint8_t, 256>;
119 static type_id_map parse_type_id_map(std::string_view format_string);
120
121 template <std::ranges::input_range R>
122 static type_id_map type_id_map_from_child_to_type_id(R&& child_index_to_type_id);
123
124 template <std::ranges::input_range R>
125 requires(std::convertible_to<std::ranges::range_value_t<R>, std::uint8_t>)
126 static std::string make_format_string(bool dense, std::size_t n, R&& child_index_to_type_id);
127
128 using children_type = std::vector<cloning_ptr<array_wrapper>>;
130
132
135
137 self_type& operator=(self_type&& rhs) = default;
138
139 [[nodiscard]] arrow_proxy& get_arrow_proxy();
140 [[nodiscard]] const arrow_proxy& get_arrow_proxy() const;
141
143 const std::uint8_t* p_type_ids;
145
146 // map from type-id to child-index
147 std::array<std::uint8_t, 256> m_type_id_map;
148
150
151#if defined(__cpp_lib_format)
152 friend struct std::formatter<DERIVED>;
153#endif
154 };
155
156 template <class D>
158
159 class dense_union_array : public union_array_crtp_base<dense_union_array>
160 {
161 public:
162
166
167 template <class... Args>
169 explicit dense_union_array(Args&&... args)
170 : dense_union_array(create_proxy(std::forward<Args>(args)...))
171 {
172 }
173
175
178
181
182 private:
183
184 using type_id_map = typename base_type::type_id_map;
185
186 template <
187 std::ranges::input_range TYPE_MAPPING = std::vector<std::uint8_t>,
188 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
189 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
190 static auto create_proxy(
191 std::vector<array>&& children,
192 type_id_buffer_type&& element_type,
193 offset_buffer_type&& offsets,
194 TYPE_MAPPING&& type_mapping = TYPE_MAPPING{},
195 std::optional<std::string_view> name = std::nullopt,
196 std::optional<METADATA_RANGE> metadata = std::nullopt
197 ) -> arrow_proxy;
198
199 template <input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
200 static auto create_proxy_impl(
201 std::vector<array>&& children,
202 type_id_buffer_type&& element_type,
203 offset_buffer_type&& offsets,
204 std::string&& format,
205 type_id_map&& tim,
206 std::optional<std::string_view> name = std::nullopt,
207 std::optional<METADATA_RANGE> metadata = std::nullopt
208 ) -> arrow_proxy;
209
210 SPARROW_API std::size_t element_offset(std::size_t i) const;
211
212 const std::int32_t* p_offsets;
214 };
215
216 class sparse_union_array : public union_array_crtp_base<sparse_union_array>
217 {
218 public:
219
222
223 template <class... Args>
225 explicit sparse_union_array(Args&&... args)
226 : sparse_union_array(create_proxy(std::forward<Args>(args)...))
227 {
228 }
229
231
232 private:
233
234 using type_id_map = typename base_type::type_id_map;
235
236 template <std::ranges::input_range TYPE_MAPPING = std::vector<std::uint8_t>>
237 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
238 static auto create_proxy(
239 std::vector<array>&& children,
240 type_id_buffer_type&& element_type,
241 TYPE_MAPPING&& type_mapping = TYPE_MAPPING{}
242 ) -> arrow_proxy;
243
244 SPARROW_API static auto create_proxy_impl(
245 std::vector<array>&& children,
246 type_id_buffer_type&& element_type,
247 std::string&& format,
248 type_id_map&& tim
249 ) -> arrow_proxy;
250
251 SPARROW_API std::size_t element_offset(std::size_t i) const;
253 };
254
255 /****************************************
256 * union_array_crtp_base implementation *
257 ****************************************/
258
259 template <class DERIVED>
261 {
262 type_id_map ret;
263 // remove +du: / +su: prefix
264 format_string.remove_prefix(4);
265
266 constexpr std::string_view delim{","};
267 std::size_t child_index = 0;
268 std::ranges::for_each(
269 format_string | std::views::split(delim),
270 [&](const auto& s)
271 {
272 const std::string str(
273 std::string_view{&*std::ranges::begin(s), static_cast<size_t>(std::ranges::distance(s))}
274 );
275 const auto as_int = std::atoi(str.c_str());
276 ret[static_cast<std::size_t>(as_int)] = static_cast<std::uint8_t>(child_index);
277 ++child_index;
278 }
279 );
280 return ret;
281 }
282
283 template <class DERIVED>
284 template <std::ranges::input_range R>
286 -> type_id_map
287 {
288 const std::size_t n = std::ranges::size(child_index_to_type_id);
289 std::array<std::uint8_t, 256> ret;
290 if (n == 0)
291 {
292 for (std::size_t i = 0; i < 256; ++i)
293 {
294 ret[i] = static_cast<std::uint8_t>(i);
295 }
296 }
297 else
298 {
299 for (std::size_t i = 0; i < n; ++i)
300 {
301 ret[child_index_to_type_id[i]] = static_cast<std::uint8_t>(i);
302 }
303 }
304 return ret;
305 }
306
307 template <class DERIVED>
308 template <std::ranges::input_range R>
309 requires(std::convertible_to<std::ranges::range_value_t<R>, std::uint8_t>)
310 std::string union_array_crtp_base<DERIVED>::make_format_string(bool dense, const std::size_t n, R&& range)
311 {
312 const auto range_size = std::ranges::size(range);
313 if (range_size == n || range_size == 0)
314 {
315 std::string ret = dense ? "+ud:" : "+us:";
316 if (range_size == 0)
317 {
318 for (std::size_t i = 0; i < n; ++i)
319 {
320 ret += std::to_string(i) + ",";
321 }
322 }
323 else
324 {
325 for (const auto& v : range)
326 {
327 ret += std::to_string(v) + ",";
328 }
329 }
330 ret.pop_back();
331 return ret;
332 }
333 else
334 {
335 throw std::invalid_argument("Invalid type-id map");
336 }
337 }
338
339 template <class DERIVED>
340 std::optional<std::string_view> union_array_crtp_base<DERIVED>::name() const
341 {
342 return m_proxy.name();
343 }
344
345 template <class DERIVED>
346 std::optional<key_value_view> union_array_crtp_base<DERIVED>::metadata() const
347 {
348 return m_proxy.metadata();
349 }
350
351 template <class DERIVED>
356
357 template <class DERIVED>
362
363 template <class DERIVED>
365 : m_proxy(std::move(proxy))
366 , p_type_ids(reinterpret_cast<std::uint8_t*>(m_proxy.buffers()[0 /*index of type-ids*/].data()))
369 {
370 }
371
372 template <class DERIVED>
377
378 template <class DERIVED>
380 {
381 if (this != &rhs)
382 {
383 m_proxy = rhs.m_proxy;
384 p_type_ids = reinterpret_cast<std::uint8_t*>(m_proxy.buffers()[0 /*index of type-ids*/].data());
387 }
388 return *this;
389 }
390
391 template <class DERIVED>
393 {
394 const auto type_id = static_cast<std::size_t>(p_type_ids[i]);
395 const auto child_index = m_type_id_map[type_id];
396 const auto offset = this->derived_cast().element_offset(i);
397 return array_element(*m_children[child_index], static_cast<std::size_t>(offset));
398 }
399
400 template <class DERIVED>
402 {
403 return static_cast<const derived_type&>(*this)[i];
404 }
405
406 template <class DERIVED>
408 {
409 return m_proxy.length();
410 }
411
412 template <class DERIVED>
414 {
415 return size() == 0;
416 }
417
418 template <class DERIVED>
420 {
421 return iterator(functor_type{&(this->derived_cast())}, 0);
422 }
423
424 template <class DERIVED>
426 {
427 return iterator(functor_type{&(this->derived_cast())}, this->size());
428 }
429
430 template <class DERIVED>
432 {
433 return cbegin();
434 }
435
436 template <class DERIVED>
438 {
439 return cend();
440 }
441
442 template <class DERIVED>
447
448 template <class DERIVED>
450 {
451 return const_iterator(const_functor_type{&(this->derived_cast())}, this->size());
452 }
453
454 template <class DERIVED>
459
460 template <class DERIVED>
465
466 template <class DERIVED>
471
472 template <class DERIVED>
474 {
475 return rend();
476 }
477
478 template <class DERIVED>
480 {
481 return (*this)[0];
482 }
483
484 template <class DERIVED>
486 {
487 return (*this)[this->size() - 1];
488 }
489
490 template <class DERIVED>
492 {
493 children_type children(proxy.children().size(), nullptr);
494 for (std::size_t i = 0; i < children.size(); ++i)
495 {
496 children[i] = array_factory(proxy.children()[i].view());
497 }
498 return children;
499 }
500
501 template <class D>
503 {
504 return std::ranges::equal(lhs, rhs);
505 }
506
507 /************************************
508 * dense_union_array implementation *
509 ************************************/
510
511 template <std::ranges::input_range TYPE_MAPPING, input_metadata_container METADATA_RANGE>
512 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
513 auto dense_union_array::create_proxy(
514 std::vector<array>&& children,
515 type_id_buffer_type&& element_type,
516 offset_buffer_type&& offsets,
517 TYPE_MAPPING&& child_index_to_type_id,
518 std::optional<std::string_view> name,
519 std::optional<METADATA_RANGE> metadata
520 ) -> arrow_proxy
521 {
522 const auto n_children = children.size();
523
524 // inverse type mapping (type_id -> child_index)
525 auto type_id_to_child_index = type_id_map_from_child_to_type_id(child_index_to_type_id);
526
527 std::string format = make_format_string(
528 true /*dense union*/,
529 n_children,
530 std::forward<TYPE_MAPPING>(child_index_to_type_id)
531 );
532
533 return create_proxy_impl(
534 std::move(children),
535 std::move(element_type),
536 std::move(offsets),
537 std::move(format),
538 std::move(type_id_to_child_index),
539 std::move(name),
540 std::move(metadata)
541 );
542 }
543
544 template <input_metadata_container METADATA_RANGE>
545 auto dense_union_array::create_proxy_impl(
546 std::vector<array>&& children,
547 type_id_buffer_type&& element_type,
548 offset_buffer_type&& offsets,
549 std::string&& format,
550 type_id_map&& tim,
551 std::optional<std::string_view> name,
552 std::optional<METADATA_RANGE> metadata
553 ) -> arrow_proxy
554 {
555 const auto n_children = children.size();
556 ArrowSchema** child_schemas = new ArrowSchema*[n_children];
557 ArrowArray** child_arrays = new ArrowArray*[n_children];
558 const auto size = element_type.size();
559
560 // count nulls (expensive!)
561 int64_t null_count = 0;
562 for (std::size_t i = 0; i < size; ++i)
563 {
564 // child_id from type_id
565 const auto type_id = static_cast<std::uint8_t>(element_type[i]);
566 const auto child_index = tim[type_id];
567 const auto offset = static_cast<std::size_t>(offsets[i]);
568 // check if child is null
569 if (!children[child_index][offset].has_value())
570 {
571 ++null_count;
572 }
573 }
574
575 for (std::size_t i = 0; i < n_children; ++i)
576 {
577 auto& child = children[i];
578 auto [flat_arr, flat_schema] = extract_arrow_structures(std::move(child));
579 child_arrays[i] = new ArrowArray(std::move(flat_arr));
580 child_schemas[i] = new ArrowSchema(std::move(flat_schema));
581 }
582
583 ArrowSchema schema = make_arrow_schema(
584 std::move(format),
585 std::move(name), // name
586 std::move(metadata), // metadata
587 std::nullopt, // flags,
588 child_schemas, // children
589 repeat_view<bool>(true, n_children), // children_ownership
590 nullptr, // dictionary,
591 true // dictionary ownership
592 );
593
594 std::vector<buffer<std::uint8_t>> arr_buffs = {
595 std::move(element_type).extract_storage(),
596 std::move(offsets).extract_storage()
597 };
598
599 ArrowArray arr = make_arrow_array(
600 static_cast<std::int64_t>(size), // length
601 static_cast<std::int64_t>(null_count),
602 0, // offset
603 std::move(arr_buffs),
604 child_arrays, // children
605 repeat_view<bool>(true, n_children), // children_ownership
606 nullptr, // dictionary,
607 true
608 );
609 return arrow_proxy{std::move(arr), std::move(schema)};
610 }
611
612 /*************************************
613 * sparse_union_array implementation *
614 *************************************/
615
616 template <std::ranges::input_range TYPE_MAPPING>
617 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
618 auto sparse_union_array::create_proxy(
619 std::vector<array>&& children,
620 type_id_buffer_type&& element_type,
621 TYPE_MAPPING&& child_index_to_type_id
622 ) -> arrow_proxy
623 {
624 const auto n_children = children.size();
625
626 // inverse type mapping (type_id -> child_index)
627 auto type_id_to_child_index = type_id_map_from_child_to_type_id(child_index_to_type_id);
628
629 std::string format = make_format_string(
630 false /*is dense union*/,
631 n_children,
632 std::forward<TYPE_MAPPING>(child_index_to_type_id)
633 );
634
635 return create_proxy_impl(
636 std::move(children),
637 std::move(element_type),
638 std::move(format),
639 std::move(type_id_to_child_index)
640 );
641 }
642}
643
644#if defined(__cpp_lib_format)
645
646template <typename U>
647 requires std::derived_from<U, sparrow::union_array_crtp_base<U>>
648struct std::formatter<U>
649{
650 constexpr auto parse(std::format_parse_context& ctx)
651 {
652 return ctx.begin(); // Simple implementation
653 }
654
655 auto format(const U& ar, std::format_context& ctx) const
656 {
657 if constexpr (std::is_same_v<U, sparrow::dense_union_array>)
658 {
659 std::format_to(ctx.out(), "DenseUnion");
660 }
661 else if constexpr (std::is_same_v<U, sparrow::sparse_union_array>)
662 {
663 std::format_to(ctx.out(), "SparseUnion");
664 }
665 else
666 {
667 static_assert(sparrow::mpl::dependent_false<U>::value, "Unknown union array type");
669 }
670 const auto& proxy = ar.get_arrow_proxy();
671 std::format_to(ctx.out(), " [name={} | size={}] <", proxy.name().value_or("nullptr"), proxy.length());
672
673 std::for_each(
674 ar.cbegin(),
675 std::prev(ar.cend()),
676 [&ctx](const auto& value)
677 {
678 std::format_to(ctx.out(), "{}, ", value);
679 }
680 );
681
682 return std::format_to(ctx.out(), "{}>", ar.back());
683 }
684};
685
686namespace sparrow
687{
688 template <typename U>
689 requires std::derived_from<U, sparrow::union_array_crtp_base<U>>
690 std::ostream& operator<<(std::ostream& os, const U& value)
691 {
692 os << std::format("{}", value);
693 return os;
694 }
695}
696
697#endif
void sparse_union_array()
Proxy class over ArrowArray and ArrowSchema.
Base class for CRTP base classes.
Definition crtp_base.hpp:29
derived_type & derived_cast()
Definition crtp_base.hpp:39
SPARROW_API dense_union_array & operator=(const dense_union_array &rhs)
u8_buffer< std::uint32_t > offset_buffer_type
union_array_crtp_base< dense_union_array > base_type
typename base_type::type_id_buffer_type type_id_buffer_type
dense_union_array(Args &&... args)
SPARROW_API dense_union_array(arrow_proxy proxy)
SPARROW_API dense_union_array(const dense_union_array &rhs)
dense_union_array(dense_union_array &&rhs)=default
dense_union_array & operator=(dense_union_array &&rhs)=default
A view that repeats a value a given number of times.
sparse_union_array(Args &&... args)
union_array_crtp_base< sparse_union_array > base_type
typename base_type::type_id_buffer_type type_id_buffer_type
SPARROW_API sparse_union_array(arrow_proxy proxy)
This buffer class is use as storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
array_traits::inner_value_type inner_value_type
std::reverse_iterator< const_iterator > const_reverse_iterator
std::array< std::uint8_t, 256 > type_id_map
const_iterator begin() const
self_type & operator=(self_type &&rhs)=default
union_array_crtp_base(const self_type &rhs)
static std::string make_format_string(bool dense, std::size_t n, R &&child_index_to_type_id)
detail::layout_bracket_functor< const derived_type, value_type > const_functor_type
union_array_crtp_base(self_type &&rhs)=default
array_traits::const_reference value_type
u8_buffer< std::uint8_t > type_id_buffer_type
const_reverse_iterator rend() const
self_type & operator=(const self_type &rhs)
const_iterator cbegin() const
union_array_crtp_base< DERIVED > self_type
const_iterator cend() const
union_array_crtp_base(arrow_proxy proxy)
value_type operator[](size_type i)
detail::layout_bracket_functor< derived_type, value_type > functor_type
const_reverse_iterator crbegin() const
const_reverse_iterator crend() const
static type_id_map parse_type_id_map(std::string_view format_string)
functor_index_iterator< const_functor_type > const_iterator
value_type operator[](size_type i) const
value_type at(size_type i) const
functor_index_iterator< functor_type > iterator
const_reverse_iterator rbegin() const
const arrow_proxy & get_arrow_proxy() const
std::optional< key_value_view > metadata() const
std::optional< std::string_view > name() const
static type_id_map type_id_map_from_child_to_type_id(R &&child_index_to_type_id)
children_type make_children(arrow_proxy &proxy)
const_iterator end() const
std::vector< cloning_ptr< array_wrapper > > children_type
#define SPARROW_API
Definition config.hpp:38
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
void unreachable()
Invokes undefined behavior.
Definition mp_utils.hpp:425
constexpr bool is_dense_union_array_v
Checks whether T is a dense_union_array type.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArrays and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
SPARROW_API cloning_ptr< array_wrapper > array_factory(arrow_proxy proxy)
SPARROW_API array_traits::const_reference array_element(const array_wrapper &ar, std::size_t index)
std::ostream & operator<<(std::ostream &stream, T n)
Definition large_int.hpp:93
constexpr bool is_sparse_union_array_v
Checks whether T is a sparse_union_array type.
std::size_t range_size(R &&r)
Definition ranges.hpp:33
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
mpl::rename< mpl::transform< detail::array_const_reference_t, all_base_types_t >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type