sparrow 0.3.0
Loading...
Searching...
No Matches
union_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include "sparrow/array_api.hpp"
29
30namespace sparrow
31{
34
35 namespace detail
36 {
37 template <class T>
39
40 template <>
42 {
43 [[nodiscard]] static constexpr sparrow::data_type get()
44 {
46 }
47 };
48
49 template <>
51 {
52 [[nodiscard]] static constexpr sparrow::data_type get()
53 {
55 }
56 };
57 }
58
62 template <class T>
63 constexpr bool is_dense_union_array_v = std::same_as<T, dense_union_array>;
64
68 template <class T>
69 constexpr bool is_sparse_union_array_v = std::same_as<T, sparse_union_array>;
70
71 // helper CRTP base so that sparse and dense union arrays share most of their code
72 template <class DERIVED>
73 class union_array_crtp_base : public crtp_base<DERIVED>
74 {
75 public:
76
78 using derived_type = DERIVED;
85 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
86 using size_type = std::size_t;
87
89
90 [[nodiscard]] std::optional<std::string_view> name() const;
91 [[nodiscard]] std::optional<std::string_view> metadata() const;
92
93 [[nodiscard]] value_type at(size_type i) const;
94 [[nodiscard]] value_type operator[](size_type i) const;
96 [[nodiscard]] value_type front() const;
97 [[nodiscard]] value_type back() const;
98
99 [[nodiscard]] bool empty() const;
100 [[nodiscard]] size_type size() const;
101
102 [[nodiscard]] iterator begin();
103 [[nodiscard]] iterator end();
104 [[nodiscard]] const_iterator begin() const;
105 [[nodiscard]] const_iterator end() const;
106 [[nodiscard]] const_iterator cbegin() const;
107 [[nodiscard]] const_iterator cend() const;
108
109 [[nodiscard]] const_reverse_iterator rbegin() const;
110 [[nodiscard]] const_reverse_iterator rend() const;
111
112 [[nodiscard]] const_reverse_iterator crbegin() const;
113 [[nodiscard]] const_reverse_iterator crend() const;
114
115 protected:
116
117 using type_id_map = std::array<std::uint8_t, 256>;
118 static type_id_map parse_type_id_map(std::string_view format_string);
119
120 template <std::ranges::input_range R>
121 static type_id_map type_id_map_from_child_to_type_id(R&& child_index_to_type_id);
122
123 template <std::ranges::input_range R>
124 requires(std::convertible_to<std::ranges::range_value_t<R>, std::uint8_t>)
125 static std::string make_format_string(bool dense, std::size_t n, R&& child_index_to_type_id);
126
127 using children_type = std::vector<cloning_ptr<array_wrapper>>;
129
131
134
136 self_type& operator=(self_type&& rhs) = default;
137
138 [[nodiscard]] arrow_proxy& get_arrow_proxy();
139 [[nodiscard]] const arrow_proxy& get_arrow_proxy() const;
140
142 const std::uint8_t* p_type_ids;
144
145 // map from type-id to child-index
146 std::array<std::uint8_t, 256> m_type_id_map;
147
149
150#if defined(__cpp_lib_format)
151 friend struct std::formatter<DERIVED>;
152#endif
153 };
154
155 template <class D>
157
158 class dense_union_array : public union_array_crtp_base<dense_union_array>
159 {
160 public:
161
165
166 template <class... Args>
168 explicit dense_union_array(Args&&... args)
169 : dense_union_array(create_proxy(std::forward<Args>(args)...))
170 {
171 }
172
174
177
180
181 private:
182
183 using type_id_map = typename base_type::type_id_map;
184
185 template <std::ranges::input_range TYPE_MAPPING = std::vector<std::uint8_t>>
186 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
187 static auto create_proxy(
188 std::vector<array>&& children,
189 type_id_buffer_type&& element_type,
190 offset_buffer_type&& offsets,
191 TYPE_MAPPING&& type_mapping = TYPE_MAPPING{},
192 std::optional<std::string_view> name = std::nullopt,
193 std::optional<std::string_view> metadata = std::nullopt
194 ) -> arrow_proxy;
195
196 SPARROW_API static auto create_proxy_impl(
197 std::vector<array>&& children,
198 type_id_buffer_type&& element_type,
199 offset_buffer_type&& offsets,
200 std::string&& format,
201 type_id_map&& tim,
202 std::optional<std::string_view> name = std::nullopt,
203 std::optional<std::string_view> metadata = std::nullopt
204 ) -> arrow_proxy;
205
206 SPARROW_API std::size_t element_offset(std::size_t i) const;
207
208 const std::int32_t* p_offsets;
210 };
211
212 class sparse_union_array : public union_array_crtp_base<sparse_union_array>
213 {
214 public:
215
218
219 template <class... Args>
221 explicit sparse_union_array(Args&&... args)
222 : sparse_union_array(create_proxy(std::forward<Args>(args)...))
223 {
224 }
225
227
228 private:
229
230 using type_id_map = typename base_type::type_id_map;
231
232 template <std::ranges::input_range TYPE_MAPPING = std::vector<std::uint8_t>>
233 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
234 static auto create_proxy(
235 std::vector<array>&& children,
236 type_id_buffer_type&& element_type,
237 TYPE_MAPPING&& type_mapping = TYPE_MAPPING{}
238 ) -> arrow_proxy;
239
240 SPARROW_API static auto create_proxy_impl(
241 std::vector<array>&& children,
242 type_id_buffer_type&& element_type,
243 std::string&& format,
244 type_id_map&& tim
245 ) -> arrow_proxy;
246
247 SPARROW_API std::size_t element_offset(std::size_t i) const;
249 };
250
251 /****************************************
252 * union_array_crtp_base implementation *
253 ****************************************/
254
255 template <class DERIVED>
257 {
258 type_id_map ret;
259 // remove the 4-character "+ud:" / "+us:" prefix
260 format_string.remove_prefix(4);
261
262 constexpr std::string_view delim{","};
263 std::size_t child_index = 0;
264 std::ranges::for_each(
265 format_string | std::views::split(delim),
266 [&](const auto& s)
267 {
268 const auto as_int = std::atoi(std::string(s.begin(), s.end()).c_str());
269 ret[static_cast<std::size_t>(as_int)] = static_cast<std::uint8_t>(child_index);
270 ++child_index;
271 }
272 );
273 return ret;
274 }
275
276 template <class DERIVED>
277 template <std::ranges::input_range R>
279 -> type_id_map
280 {
281 const std::size_t n = std::ranges::size(child_index_to_type_id);
282 std::array<std::uint8_t, 256> ret;
283 if (n == 0)
284 {
285 for (std::size_t i = 0; i < 256; ++i)
286 {
287 ret[i] = static_cast<std::uint8_t>(i);
288 }
289 }
290 else
291 {
292 for (std::size_t i = 0; i < n; ++i)
293 {
294 ret[child_index_to_type_id[i]] = static_cast<std::uint8_t>(i);
295 }
296 }
297 return ret;
298 }
299
300 template <class DERIVED>
301 template <std::ranges::input_range R>
302 requires(std::convertible_to<std::ranges::range_value_t<R>, std::uint8_t>)
303 std::string union_array_crtp_base<DERIVED>::make_format_string(bool dense, const std::size_t n, R&& range)
304 {
305 const auto range_size = std::ranges::size(range);
306 if (range_size == n || range_size == 0)
307 {
308 std::string ret = dense ? "+ud:" : "+us:";
309 if (range_size == 0)
310 {
311 for (std::size_t i = 0; i < n; ++i)
312 {
313 ret += std::to_string(i) + ",";
314 }
315 }
316 else
317 {
318 for (const auto& v : range)
319 {
320 ret += std::to_string(v) + ",";
321 }
322 }
323 ret.pop_back();
324 return ret;
325 }
326 else
327 {
328 throw std::invalid_argument("Invalid type-id map");
329 }
330 }
331
332 template <class DERIVED>
333 std::optional<std::string_view> union_array_crtp_base<DERIVED>::name() const
334 {
335 return m_proxy.name();
336 }
337
338 template <class DERIVED>
339 std::optional<std::string_view> union_array_crtp_base<DERIVED>::metadata() const
340 {
341 return m_proxy.metadata();
342 }
343
344 template <class DERIVED>
349
350 template <class DERIVED>
355
356 template <class DERIVED>
358 : m_proxy(std::move(proxy))
359 , p_type_ids(reinterpret_cast<std::uint8_t*>(m_proxy.buffers()[0 /*index of type-ids*/].data()))
362 {
363 }
364
365 template <class DERIVED>
370
371 template <class DERIVED>
373 {
374 if (this != &rhs)
375 {
376 m_proxy = rhs.m_proxy;
377 p_type_ids = reinterpret_cast<std::uint8_t*>(m_proxy.buffers()[0 /*index of type-ids*/].data());
380 }
381 return *this;
382 }
383
384 template <class DERIVED>
386 {
387 const auto type_id = static_cast<std::size_t>(p_type_ids[i]);
388 const auto child_index = m_type_id_map[type_id];
389 const auto offset = this->derived_cast().element_offset(i);
390 return array_element(*m_children[child_index], static_cast<std::size_t>(offset));
391 }
392
393 template <class DERIVED>
395 {
396 return static_cast<const derived_type&>(*this)[i];
397 }
398
399 template <class DERIVED>
401 {
402 return m_proxy.length();
403 }
404
405 template <class DERIVED>
407 {
408 return size() == 0;
409 }
410
411 template <class DERIVED>
413 {
414 return iterator(functor_type{&(this->derived_cast())}, 0);
415 }
416
417 template <class DERIVED>
419 {
420 return iterator(functor_type{&(this->derived_cast())}, this->size());
421 }
422
423 template <class DERIVED>
425 {
426 return cbegin();
427 }
428
429 template <class DERIVED>
431 {
432 return cend();
433 }
434
435 template <class DERIVED>
440
441 template <class DERIVED>
443 {
444 return const_iterator(const_functor_type{&(this->derived_cast())}, this->size());
445 }
446
447 template <class DERIVED>
452
453 template <class DERIVED>
458
459 template <class DERIVED>
464
465 template <class DERIVED>
467 {
468 return rend();
469 }
470
471 template <class DERIVED>
473 {
474 return (*this)[0];
475 }
476
477 template <class DERIVED>
479 {
480 return (*this)[this->size() - 1];
481 }
482
483 template <class DERIVED>
485 {
486 children_type children(proxy.children().size(), nullptr);
487 for (std::size_t i = 0; i < children.size(); ++i)
488 {
489 children[i] = array_factory(proxy.children()[i].view());
490 }
491 return children;
492 }
493
494 template <class D>
496 {
497 return std::ranges::equal(lhs, rhs);
498 }
499
500 /************************************
501 * dense_union_array implementation *
502 ************************************/
503
504 template <std::ranges::input_range TYPE_MAPPING>
505 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
506 auto dense_union_array::create_proxy(
507 std::vector<array>&& children,
508 type_id_buffer_type&& element_type,
509 offset_buffer_type&& offsets,
510 TYPE_MAPPING&& child_index_to_type_id,
511 std::optional<std::string_view> name,
512 std::optional<std::string_view> metadata
513 ) -> arrow_proxy
514 {
515 const auto n_children = children.size();
516
517 // inverse type mapping (type_id -> child_index)
518 auto type_id_to_child_index = type_id_map_from_child_to_type_id(child_index_to_type_id);
519
520 std::string format = make_format_string(
521 true /*dense union*/,
522 n_children,
523 std::forward<TYPE_MAPPING>(child_index_to_type_id)
524 );
525
526 return create_proxy_impl(
527 std::move(children),
528 std::move(element_type),
529 std::move(offsets),
530 std::move(format),
531 std::move(type_id_to_child_index),
532 std::move(name),
533 std::move(metadata)
534 );
535 }
536
537 /*************************************
538 * sparse_union_array implementation *
539 *************************************/
540
541 template <std::ranges::input_range TYPE_MAPPING>
542 requires(std::convertible_to<std::ranges::range_value_t<TYPE_MAPPING>, std::uint8_t>)
543 auto sparse_union_array::create_proxy(
544 std::vector<array>&& children,
545 type_id_buffer_type&& element_type,
546 TYPE_MAPPING&& child_index_to_type_id
547 ) -> arrow_proxy
548 {
549 const auto n_children = children.size();
550
551 // inverse type mapping (type_id -> child_index)
552 auto type_id_to_child_index = type_id_map_from_child_to_type_id(child_index_to_type_id);
553
554 std::string format = make_format_string(
555 false /*is dense union*/,
556 n_children,
557 std::forward<TYPE_MAPPING>(child_index_to_type_id)
558 );
559
560 return create_proxy_impl(
561 std::move(children),
562 std::move(element_type),
563 std::move(format),
564 std::move(type_id_to_child_index)
565 );
566 }
567}
568
569#if defined(__cpp_lib_format)
570
/**
 * std::formatter specialisation shared by the union array types deriving from
 * sparrow::union_array_crtp_base.
 *
 * Output shape: `<Kind> [name=<name> | size=<length>] <e0, e1, ..., eN>` where
 * <Kind> is "DenseUnion" or "SparseUnion" and <name> falls back to "nullptr"
 * when the proxy reports no name. An empty array is rendered with an empty
 * element list (`<>`).
 */
template <typename U>
    requires std::derived_from<U, sparrow::union_array_crtp_base<U>>
struct std::formatter<U>
{
    /// No format specification is supported; the spec is left unconsumed.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /// Writes the textual representation of `ar` to the context's output.
    auto format(const U& ar, std::format_context& ctx) const
    {
        auto out = ctx.out();
        if constexpr (std::is_same_v<U, sparrow::dense_union_array>)
        {
            out = std::format_to(out, "DenseUnion");
        }
        else if constexpr (std::is_same_v<U, sparrow::sparse_union_array>)
        {
            out = std::format_to(out, "SparseUnion");
        }
        else
        {
            static_assert(sparrow::mpl::dependent_false<U>::value, "Unknown union array type");
        }

        const auto& proxy = ar.get_arrow_proxy();
        out = std::format_to(
            out,
            " [name={} | size={}] <",
            proxy.name().value_or("nullptr"),
            proxy.length()
        );

        // Separators go *between* elements, so no element needs special
        // treatment and an empty array is never dereferenced.
        bool needs_separator = false;
        for (auto it = ar.cbegin(); it != ar.cend(); ++it)
        {
            if (needs_separator)
            {
                out = std::format_to(out, ", ");
            }
            out = std::format_to(out, "{}", *it);
            needs_separator = true;
        }

        return std::format_to(out, ">");
    }
};
610
namespace sparrow
{
    /// Streams a union array by formatting it with "{}" and inserting the
    /// resulting string into the stream; returns the stream.
    template <typename U>
        requires std::derived_from<U, sparrow::union_array_crtp_base<U>>
    std::ostream& operator<<(std::ostream& os, const U& value)
    {
        return os << std::format("{}", value);
    }
}
621
622#endif
void sparse_union_array()
Proxy class over ArrowArray and ArrowSchema.
Base class for CRTP base classes.
Definition crtp_base.hpp:29
derived_type & derived_cast()
Definition crtp_base.hpp:39
SPARROW_API dense_union_array & operator=(const dense_union_array &rhs)
u8_buffer< std::uint32_t > offset_buffer_type
union_array_crtp_base< dense_union_array > base_type
typename base_type::type_id_buffer_type type_id_buffer_type
dense_union_array(Args &&... args)
SPARROW_API dense_union_array(arrow_proxy proxy)
SPARROW_API dense_union_array(const dense_union_array &rhs)
dense_union_array(dense_union_array &&rhs)=default
dense_union_array & operator=(dense_union_array &&rhs)=default
sparse_union_array(Args &&... args)
union_array_crtp_base< sparse_union_array > base_type
typename base_type::type_id_buffer_type type_id_buffer_type
SPARROW_API sparse_union_array(arrow_proxy proxy)
array_traits::inner_value_type inner_value_type
std::reverse_iterator< const_iterator > const_reverse_iterator
std::array< std::uint8_t, 256 > type_id_map
const_iterator begin() const
self_type & operator=(self_type &&rhs)=default
union_array_crtp_base(const self_type &rhs)
static std::string make_format_string(bool dense, std::size_t n, R &&child_index_to_type_id)
detail::layout_bracket_functor< const derived_type, value_type > const_functor_type
union_array_crtp_base(self_type &&rhs)=default
array_traits::const_reference value_type
u8_buffer< std::uint8_t > type_id_buffer_type
const_reverse_iterator rend() const
self_type & operator=(const self_type &rhs)
const_iterator cbegin() const
union_array_crtp_base< DERIVED > self_type
const_iterator cend() const
union_array_crtp_base(arrow_proxy proxy)
value_type operator[](size_type i)
detail::layout_bracket_functor< derived_type, value_type > functor_type
const_reverse_iterator crbegin() const
const_reverse_iterator crend() const
static type_id_map parse_type_id_map(std::string_view format_string)
functor_index_iterator< const_functor_type > const_iterator
value_type operator[](size_type i) const
value_type at(size_type i) const
functor_index_iterator< functor_type > iterator
const_reverse_iterator rbegin() const
const arrow_proxy & get_arrow_proxy() const
std::optional< std::string_view > name() const
static type_id_map type_id_map_from_child_to_type_id(R &&child_index_to_type_id)
children_type make_children(arrow_proxy &proxy)
const_iterator end() const
std::optional< std::string_view > metadata() const
std::vector< cloning_ptr< array_wrapper > > children_type
#define SPARROW_API
Definition config.hpp:38
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
void unreachable()
Invokes undefined behavior.
Definition mp_utils.hpp:425
constexpr bool is_dense_union_array_v
Checks whether T is a dense_union_array type.
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
SPARROW_API cloning_ptr< array_wrapper > array_factory(arrow_proxy proxy)
SPARROW_API array_traits::const_reference array_element(const array_wrapper &ar, std::size_t index)
std::ostream & operator<<(std::ostream &stream, T n)
Definition large_int.hpp:93
constexpr bool is_sparse_union_array_v
Checks whether T is a sparse_union_array type.
std::size_t range_size(R &&r)
Definition ranges.hpp:31
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
mpl::rename< mpl::transform< detail::array_const_reference_t, all_base_types_t >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type