sparrow 0.9.0
Loading...
Searching...
No Matches
dictionary_encoded_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <optional>
18
19#include "sparrow/array_api.hpp"
35
36namespace sparrow
37{
/**
 * Functor that maps an index to the element stored at that position of a layout.
 *
 * The functor does not own the layout: it keeps a pointer to it, so the layout
 * must outlive every call to operator().
 *
 * @tparam Layout   Type of the layout; must expose `reference`,
 *                  `const_reference` and `operator[](std::size_t)`.
 * @tparam is_const When true the layout is accessed through a pointer to const
 *                  and elements are returned as `const_reference`; otherwise
 *                  through a pointer to non-const and returned as `reference`.
 */
template <class Layout, bool is_const>
class layout_element_functor
{
public:

    using layout_type = Layout;
    // Both alternatives are pointers: operator() dereferences the storage with
    // `->`, which a by-value `layout_type` storage could not support.
    using storage_type = std::conditional_t<is_const, const layout_type*, layout_type*>;
    using return_type = std::
        conditional_t<is_const, typename layout_type::const_reference, typename layout_type::reference>;

    /// Builds a functor bound to no layout; calling operator() on it is invalid.
    constexpr layout_element_functor() = default;

    /// Builds a functor bound to the layout pointed to by @p layout_.
    constexpr explicit layout_element_functor(storage_type layout_)
        : p_layout(layout_)
    {
    }

    /// Returns the element at index @p i of the bound layout.
    [[nodiscard]] return_type operator()(std::size_t i) const
    {
        return p_layout->operator[](i);
    }

private:

    // Null until a layout is bound, so a default-constructed functor holds a
    // well-defined value instead of an indeterminate pointer.
    storage_type p_layout = nullptr;
};
64
65 template <std::integral IT>
66 class dictionary_encoded_array;
67
// Implementation details: traits specialised for dictionary_encoded_array.
// NOTE(review): this listing elides the headers of both specialisations
// (listing lines 74 and 83) and the return statement of the first get()
// (listing line 78); confirm them against the real header.
68 namespace detail
69 {
// Primary template: only declared here, no definition is visible in this listing.
70 template <class T>
71 struct get_data_type_from_array;
72
// Specialisation whose get() yields a sparrow::data_type at compile time.
73 template <std::integral IT>
75 {
76 [[nodiscard]] static constexpr sparrow::data_type get()
77 {
79 }
80 };
81
// Specialisation whose get() is the compile-time constant `true`.
82 template <std::integral IT>
84 {
85 [[nodiscard]] static constexpr bool get()
86 {
87 return true;
88 }
89 };
90 }
91
95 template <class T>
97
// Array whose elements are obtained by looking up integral keys of type IT
// (stored in a primitive_array<IT>) in a separate values layout.
// NOTE(review): this listing elides the class-head, most type aliases and the
// special member declarations (listing lines 99, 103, 107-128, 150, ...); the
// comments below describe only what is visible.
98 template <std::integral IT>
100 {
101 public:
102
104 using size_type = std::size_t;
105 using difference_type = std::ptrdiff_t;
106
108
112
115
118
120
122
125
128
// Name and metadata as reported by the underlying arrow proxy.
129 [[nodiscard]] std::optional<std::string_view> name() const;
130 [[nodiscard]] std::optional<key_value_view> metadata() const;
131
132 [[nodiscard]] size_type size() const;
133 [[nodiscard]] bool empty() const;
134
// Element access is read-only: only a const overload is declared.
135 [[nodiscard]] const_reference operator[](size_type i) const;
136
137 [[nodiscard]] iterator begin();
138 [[nodiscard]] iterator end();
139
140 [[nodiscard]] const_iterator begin() const;
141 [[nodiscard]] const_iterator end() const;
142
143 [[nodiscard]] const_iterator cbegin() const;
144 [[nodiscard]] const_iterator cend() const;
145
146 [[nodiscard]] const_reference front() const;
147 [[nodiscard]] const_reference back() const;
148
// Generic constructor: forwards its arguments to one of the create_proxy
// overloads and delegates to the constructor taking the resulting proxy.
// NOTE(review): a constraint line (listing line 150) is elided here.
149 template <class... Args>
151 explicit dictionary_encoded_array(Args&&... args)
152 : dictionary_encoded_array(create_proxy(std::forward<Args>(args)...))
153 {
154 }
155
165 [[nodiscard]] self_type slice(size_type start, size_type end) const;
166
176 [[nodiscard]] self_type slice_view(size_type start, size_type end) const;
177
178 private:
179
// Builds a proxy from a keys buffer, a values array and a validity input.
// NOTE(review): the first template parameter (listing line 181) is elided.
180 template <
182 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
183 [[nodiscard]] static auto create_proxy(
184 keys_buffer_type&& keys,
185 array&& values,
186 R&& bitmaps,
187 std::optional<std::string_view> name = std::nullopt,
188 std::optional<METADATA_RANGE> metadata = std::nullopt
189 ) -> arrow_proxy;
190
// Same as above, but the validity is selected by a boolean flag.
// NOTE(review): the first template parameter (listing line 192) is elided.
191 template <
193 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
194 [[nodiscard]] static auto create_proxy(
195 keys_buffer_type&& keys,
196 array&& values,
197 bool nullable = true,
198 std::optional<std::string_view> name = std::nullopt,
199 std::optional<METADATA_RANGE> metadata = std::nullopt
200 ) -> arrow_proxy;
201
// Common implementation the two overloads above forward to.
202 template <input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
203 [[nodiscard]] static auto create_proxy_impl(
204 keys_buffer_type&& keys,
205 array&& values,
206 std::optional<validity_bitmap> validity = std::nullopt,
207 std::optional<std::string_view> name = std::nullopt,
208 std::optional<METADATA_RANGE> metadata = std::nullopt
209 ) -> arrow_proxy;
210
// Overload for any input range of IT that is not already a keys_buffer_type:
// the keys are first copied into a keys_buffer_type, then the buffer-based
// overload is called.
// NOTE(review): one template parameter (listing line 213) is elided.
211 template <
212 std::ranges::input_range KEY_RANGE,
214 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
215 requires(
216 !std::same_as<KEY_RANGE, keys_buffer_type>
217 and std::same_as<IT, std::ranges::range_value_t<KEY_RANGE>>
218 )
219 [[nodiscard]] static arrow_proxy create_proxy(
220 KEY_RANGE&& keys,
221 array&& values,
222 R&& bitmaps = validity_bitmap{},
223 std::optional<std::string_view> name = std::nullopt,
224 std::optional<METADATA_RANGE> metadata = std::nullopt
225 )
226 {
227 keys_buffer_type keys_buffer(std::forward<KEY_RANGE>(keys));
228 return create_proxy(
229 std::move(keys_buffer),
230 std::forward<array>(values),
231 std::forward<R>(bitmaps),
232 std::move(name),
233 std::move(metadata)
234 );
235 }
236
237 // range of nullable values
238 template <
239 std::ranges::input_range NULLABLE_KEY_RANGE,
240 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
241 requires std::is_same_v<std::ranges::range_value_t<NULLABLE_KEY_RANGE>, nullable<IT>>
242 static arrow_proxy create_proxy(
243 NULLABLE_KEY_RANGE&& nullable_keys,
244 array&& values,
245 std::optional<std::string_view> name = std::nullopt,
246 std::optional<METADATA_RANGE> metadata = std::nullopt
247 );
248
249 using keys_layout = primitive_array<IT>;
250 using values_layout = cloning_ptr<array_wrapper>;
251
// Placeholders returned by operator[] when the key at the requested position
// holds no value.
252 [[nodiscard]] const inner_value_type& dummy_inner_value() const;
253 [[nodiscard]] const_reference dummy_const_reference() const;
254
// Build the keys / values layouts from the structures held by `proxy`.
255 [[nodiscard]] static keys_layout create_keys_layout(arrow_proxy& proxy);
256 [[nodiscard]] static values_layout create_values_layout(arrow_proxy& proxy);
257
258 [[nodiscard]] arrow_proxy& get_arrow_proxy();
259 [[nodiscard]] const arrow_proxy& get_arrow_proxy() const;
260
// m_proxy is declared first: the constructors initialise both layouts from it.
261 arrow_proxy m_proxy;
262 keys_layout m_keys_layout;
263 values_layout p_values_layout;
264
266 };
267
268 template <class IT>
270
271 /*******************************************
272 * dictionary_encoded_array implementation *
273 *******************************************/
274
// Constructor taking ownership of an arrow proxy (moved into m_proxy); both
// layouts are then built from the stored proxy.
// NOTE(review): the signature line (listing line 276) is elided in this listing.
275 template <std::integral IT>
277 : m_proxy(std::move(proxy))
278 , m_keys_layout(create_keys_layout(m_proxy))
279 , p_values_layout(create_values_layout(m_proxy))
280 {
// The proxy's own data type is that of the keys, hence the integer check.
281 SPARROW_ASSERT_TRUE(data_type_is_integer(m_proxy.data_type()));
282 }
283
// Copy constructor: copies the proxy of `rhs`, then rebuilds both layouts from
// this object's own copy rather than copying the layouts of `rhs`.
// NOTE(review): the signature line (listing line 285) is elided in this listing.
284 template <std::integral IT>
286 : m_proxy(rhs.m_proxy)
287 , m_keys_layout(create_keys_layout(m_proxy))
288 , p_values_layout(create_values_layout(m_proxy))
289 {
290 }
291
// Copy assignment: copies the proxy of `rhs` and rebuilds both layouts from
// this object's proxy. Self-assignment is a no-op.
// NOTE(review): the signature line (listing line 293) is elided in this listing.
292 template <std::integral IT>
294 {
295 if (this != &rhs)
296 {
297 m_proxy = rhs.m_proxy;
298 m_keys_layout = create_keys_layout(m_proxy);
299 p_values_layout = create_values_layout(m_proxy);
300 }
301 return *this;
302 }
303
// Move constructor: steals the proxy of `rhs` and builds fresh layouts from it.
// NOTE(review): the signature line (listing line 305) is elided in this listing.
// NOTE(review): the layouts of `rhs` are not rebuilt after its proxy has been
// moved from; confirm that a moved-from array is only ever destroyed or
// assigned to.
304 template <std::integral IT>
306 : m_proxy(std::move(rhs.m_proxy))
307 , m_keys_layout(create_keys_layout(m_proxy))
308 , p_values_layout(create_values_layout(m_proxy))
309 {
310 }
311
// Move assignment implemented by swapping proxies with `rhs`, then rebuilding
// this object's layouts. Self-assignment is a no-op.
// NOTE(review): the signature line (listing line 313) is elided in this listing.
// NOTE(review): after the swap `rhs` holds this object's previous proxy, but
// its layouts are not rebuilt and were created from the proxy it no longer
// holds; confirm `rhs` is not read afterwards.
312 template <std::integral IT>
314 {
315 if (this != &rhs)
316 {
// Unqualified call so that a swap overload found by ADL is preferred.
317 using std::swap;
318 swap(m_proxy, rhs.m_proxy);
319 m_keys_layout = create_keys_layout(m_proxy);
320 p_values_layout = create_values_layout(m_proxy);
321 }
322 return *this;
323 }
324
325 template <std::integral IT>
326 template <validity_bitmap_input VBI, input_metadata_container METADATA_RANGE>
327 auto dictionary_encoded_array<IT>::create_proxy(
328 keys_buffer_type&& keys,
329 array&& values,
330 VBI&& validity_input,
331 std::optional<std::string_view> name,
332 std::optional<METADATA_RANGE> metadata
333 ) -> arrow_proxy
334 {
335 const auto size = keys.size();
336 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VBI>(validity_input));
337 return create_proxy_impl(
338 std::forward<keys_buffer_type>(keys),
339 std::forward<array>(values),
340 std::make_optional<validity_bitmap>(std::move(vbitmap)),
341 std::move(name),
342 std::move(metadata)
343 );
344 }
345
346 template <std::integral IT>
347 template <validity_bitmap_input VBI, input_metadata_container METADATA_RANGE>
348 auto dictionary_encoded_array<IT>::create_proxy(
349 keys_buffer_type&& keys,
350 array&& values,
351 bool nullable,
352 std::optional<std::string_view> name,
353 std::optional<METADATA_RANGE> metadata
354 ) -> arrow_proxy
355 {
356 const auto size = keys.size();
357 return create_proxy_impl(
358 std::forward<keys_buffer_type>(keys),
359 std::forward<array>(values),
360 nullable ? std::make_optional<validity_bitmap>(nullptr, size) : std::nullopt,
361 std::move(name),
362 std::move(metadata)
363 );
364 }
365
// Assembles the arrow schema and arrow array of a dictionary-encoded array and
// wraps them in an arrow_proxy.
//
// keys     - buffer of keys; its storage becomes the second buffer of the array.
// values   - dictionary values; its arrow structures are extracted and attached
//            as the dictionary of the schema and of the array.
// validity - optional validity bitmap; when present the NULLABLE flag is set and
//            its storage becomes the first buffer of the array.
// name     - optional name forwarded to the schema.
// metadata - optional metadata forwarded to the schema.
//
// NOTE(review): the heads of the two factory calls (listing lines 386-387 and
// 404, presumably make_arrow_schema / make_arrow_array) are elided in this listing.
366 template <std::integral IT>
367 template <input_metadata_container METADATA_RANGE>
368 [[nodiscard]] arrow_proxy dictionary_encoded_array<IT>::create_proxy_impl(
369 keys_buffer_type&& keys,
370 array&& values,
371 std::optional<validity_bitmap> validity,
372 std::optional<std::string_view> name,
373 std::optional<METADATA_RANGE> metadata
374 )
375 {
// The size is read first because `keys` is emptied by extract_storage() below.
376 const auto size = keys.size();
377 auto [value_array, value_schema] = extract_arrow_structures(std::move(values));
// No children: the ownership range is constructed with a count of 0.
378 static const repeat_view<bool> children_ownership{true, 0};
379
// The NULLABLE flag is set only when a validity bitmap was supplied.
380 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
381 flags = validity.has_value()
382 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
383 : std::nullopt;
384
385 // create arrow schema and array
388 std::move(name), // name
389 std::move(metadata), // metadata
390 flags, // flags
391 nullptr, // children
392 children_ownership, // children_ownership
// NOTE(review): raw `new`; ownership is handed over through the flag on the
// next line. Confirm nothing can throw before the callee takes ownership.
393 new ArrowSchema(std::move(value_schema)), // dictionary
394 true // dictionary ownership
395 );
396
// Must be computed before the bitmap storage is extracted just below.
397 const size_t null_count = validity.has_value() ? validity->null_count() : 0;
398
// Buffer 0 holds the validity bitmap (empty when absent), buffer 1 the keys.
399 std::vector<buffer<uint8_t>> buffers(2);
400 buffers[0] = validity.has_value() ? std::move(*validity).extract_storage()
401 : buffer<uint8_t>{nullptr, 0};
402 buffers[1] = std::move(keys).extract_storage();
403 // create arrow array
405 static_cast<std::int64_t>(size), // length
406 static_cast<std::int64_t>(null_count), // Null count
407 0, // offset
408 std::move(buffers),
409 nullptr, // children
410 children_ownership, // children_ownership
411 new ArrowArray(std::move(value_array)), // dictionary
412 true // dictionary ownership
413 );
414 return arrow_proxy(std::move(arr), std::move(schema));
415 }
416
417 template <std::integral IT>
418 template <std::ranges::input_range NULLABLE_KEY_RANGE, input_metadata_container METADATA_RANGE>
419 requires std::is_same_v<std::ranges::range_value_t<NULLABLE_KEY_RANGE>, nullable<IT>>
420 arrow_proxy dictionary_encoded_array<IT>::create_proxy(
421 NULLABLE_KEY_RANGE&& nullable_keys,
422 array&& values,
423 std::optional<std::string_view> name,
424 std::optional<METADATA_RANGE> metadata
425 )
426 {
427 auto keys = nullable_keys
428 | std::views::transform(
429 [](const auto& v)
430 {
431 return v.get();
432 }
433 );
434 auto is_non_null = nullable_keys
435 | std::views::transform(
436 [](const auto& v)
437 {
438 return v.has_value();
439 }
440 );
441 return create_proxy(
442 std::move(keys),
443 std::forward<array>(values),
444 std::move(is_non_null),
445 std::move(name),
446 std::move(metadata)
447 );
448 }
449
450 template <std::integral IT>
451 std::optional<std::string_view> dictionary_encoded_array<IT>::name() const
452 {
453 return m_proxy.name();
454 }
455
456 template <std::integral IT>
457 std::optional<key_value_view> dictionary_encoded_array<IT>::metadata() const
458 {
459 return m_proxy.metadata();
460 }
461
// Returns the length reported by the underlying arrow proxy.
// NOTE(review): the signature line (listing line 463) is elided in this
// listing; presumably size().
462 template <std::integral IT>
464 {
465 return m_proxy.length();
466 }
467
// Returns true when size() is zero.
// NOTE(review): the signature line (listing line 469) is elided in this
// listing; presumably empty().
468 template <std::integral IT>
470 {
471 return size() == 0;
472 }
473
// Element access: reads the key stored at position i and, when that key holds
// a value, returns the element of the values layout found at the key's index.
// A key without a value yields the dummy const reference instead.
// NOTE(review): the signature line (listing line 475) and one statement
// (listing line 477) are elided in this listing.
474 template <std::integral IT>
476 {
478 const auto index = m_keys_layout[i];
479
480 if (index.has_value())
481 {
// IT may be signed; the key is checked before being cast to an unsigned index.
482 SPARROW_ASSERT_TRUE(index.value() >= 0);
483 return array_element(*p_values_layout, static_cast<std::size_t>(index.value()));
484 }
485 else
486 {
487 return dummy_const_reference();
488 }
489 }
490
// Returns an iterator positioned on index 0, built on a functor bound to this array.
// NOTE(review): the signature line (listing line 492) is elided in this
// listing; presumably the non-const begin().
491 template <std::integral IT>
493 {
494 return iterator(functor_type(this), 0u);
495 }
496
// Returns an iterator positioned on index size(), built on a functor bound to this array.
// NOTE(review): the signature line (listing line 498) is elided in this
// listing; presumably the non-const end().
497 template <std::integral IT>
499 {
500 return iterator(functor_type(this), size());
501 }
502
// Forwards to cbegin().
// NOTE(review): the signature line (listing line 504) is elided in this
// listing; presumably the const begin().
503 template <std::integral IT>
505 {
506 return cbegin();
507 }
508
// Forwards to cend().
// NOTE(review): the signature line (listing line 510) is elided in this
// listing; presumably the const end().
509 template <std::integral IT>
511 {
512 return cend();
513 }
514
515 template <std::integral IT>
520
521 template <std::integral IT>
526
527 template <std::integral IT>
533
// Returns the element at index size() - 1.
// NOTE(review): the signature line (listing line 535) and one statement
// (listing line 537) are elided in this listing; presumably back() preceded by
// a non-empty check. `size() - 1` wraps around on an empty array, so confirm
// that check exists.
534 template <std::integral IT>
536 {
538 return operator[](size() - 1);
539 }
540
// Returns a reference to a placeholder inner value obtained from
// array_default_element_value() applied to the values layout.
// NOTE(review): `instance` is a function-local static, so it is initialised
// only on the first call for a given IT, from whichever array makes that call,
// and then shared by every dictionary_encoded_array<IT>. Arrays whose values
// layout has a different type get the same placeholder; confirm this is intended.
541 template <std::integral IT>
542 auto dictionary_encoded_array<IT>::dummy_inner_value() const -> const inner_value_type&
543 {
544 static const inner_value_type instance = array_default_element_value(*p_values_layout);
545 return instance;
546 }
547
// Builds a new array from the proxy returned by arrow_proxy::slice(start, end).
// Asserts start <= end.
// NOTE(review): the signature line (listing line 549) is elided in this
// listing; presumably slice().
548 template <std::integral IT>
550 {
551 SPARROW_ASSERT_TRUE(start <= end);
552 return self_type{get_arrow_proxy().slice(start, end)};
553 }
554
// Builds a new array from the proxy returned by arrow_proxy::slice_view(start, end).
// Asserts start <= end.
// NOTE(review): the signature line (listing line 556) is elided in this
// listing; presumably slice_view().
555 template <std::integral IT>
557 {
558 SPARROW_ASSERT_TRUE(start <= end);
559 return self_type{get_arrow_proxy().slice_view(start, end)};
560 }
561
562 /*template <std::integral IT>
563 auto dictionary_encoded_array<IT>::dummy_inner_const_reference() const -> inner_const_reference
564 {
565 static const inner_const_reference instance =
566 std::visit([](const auto& val) -> inner_const_reference { return val; }, dummy_inner_value());
567 return instance;
568 }*/
569
// Returns the placeholder handed out by operator[] for keys without a value:
// a const_reference wrapping a nullable whose flag is false, built by visiting
// the placeholder inner value.
// NOTE(review): `instance` is a function-local static, so it is built once per
// IT, on the first call, and shared by every dictionary_encoded_array<IT>
// regardless of the type of its values layout; confirm this is intended.
570 template <std::integral IT>
571 auto dictionary_encoded_array<IT>::dummy_const_reference() const -> const_reference
572 {
573 static const const_reference instance = std::visit(
574 [](const auto& val) -> const_reference
575 {
// Map the visited value type to the const reference type of its arrow traits.
576 using inner_ref = typename arrow_traits<std::decay_t<decltype(val)>>::const_reference;
// `false` marks the nullable as holding no value.
577 return const_reference{nullable<inner_ref>(inner_ref(val), false)};
578 },
579 dummy_inner_value()
580 );
581 return instance;
582 }
583
584 template <std::integral IT>
585 typename dictionary_encoded_array<IT>::values_layout
586 dictionary_encoded_array<IT>::create_values_layout(arrow_proxy& proxy)
587 {
588 const auto& dictionary = proxy.dictionary();
589 SPARROW_ASSERT_TRUE(dictionary);
590 arrow_proxy ar_dictionary{&(dictionary->array()), &(dictionary->schema())};
591 return array_factory(std::move(ar_dictionary));
592 }
593
594 template <std::integral IT>
595 auto dictionary_encoded_array<IT>::create_keys_layout(arrow_proxy& proxy) -> keys_layout
596 {
597 return keys_layout{arrow_proxy{&proxy.array(), &proxy.schema()}};
598 }
599
600 template <std::integral IT>
601 auto dictionary_encoded_array<IT>::get_arrow_proxy() -> arrow_proxy&
602 {
603 return m_proxy;
604 }
605
606 template <std::integral IT>
607 auto dictionary_encoded_array<IT>::get_arrow_proxy() const -> const arrow_proxy&
608 {
609 return m_proxy;
610 }
611
// Compares two operands element by element with std::ranges::equal.
// NOTE(review): the signature line (listing line 613) is elided in this
// listing; presumably operator== for two dictionary_encoded_array<IT>.
612 template <class IT>
614 {
615 return std::ranges::equal(lhs, rhs);
616 }
617}
618
619#if defined(__cpp_lib_format)
/**
 * std::formatter specialization for sparrow::dictionary_encoded_array.
 *
 * Output shape: `Dictionary [size=N] <e0, e1, ..., eN-1>`. An empty array is
 * rendered as `Dictionary [size=0] <>`.
 */
template <std::integral IT>
struct std::formatter<sparrow::dictionary_encoded_array<IT>>
{
    /// No format specification is interpreted: the parse position is returned unchanged.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes @p ar to the output of @p ctx.
     *
     * @return The output iterator positioned after the last written character.
     */
    auto format(const sparrow::dictionary_encoded_array<IT>& ar, std::format_context& ctx) const
    {
        // Thread the iterator returned by format_to through every call so the
        // returned position always reflects what has been written.
        auto out = std::format_to(ctx.out(), "Dictionary [size={}] <", ar.size());

        // std::prev(cend()) and back() are only valid on a non-empty array.
        if (!ar.empty())
        {
            std::for_each(
                ar.cbegin(),
                std::prev(ar.cend()),
                [&out](const auto& value)
                {
                    out = std::format_to(out, "{}, ", value);
                }
            );
            // The last element is written without the trailing separator.
            out = std::format_to(out, "{}", ar.back());
        }

        return std::format_to(out, ">");
    }
};
643
/// Streams @p value to @p os using its std::formatter specialization.
template <std::integral IT>
std::ostream& operator<<(std::ostream& os, const sparrow::dictionary_encoded_array<IT>& value)
{
    const auto text = std::format("{}", value);
    return os << text;
}
650#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
self_type & operator=(const self_type &)
functor_index_iterator< const_functor_type > const_iterator
self_type slice_view(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
std::optional< key_value_view > metadata() const
layout_element_functor< self_type, true > functor_type
std::optional< std::string_view > name() const
layout_element_functor< self_type, true > const_functor_type
self_type slice(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
functor_index_iterator< functor_type > iterator
const_reference operator[](size_type i) const
constexpr layout_element_functor(storage_type layout_)
std:: conditional_t< is_const, typename layout_type::const_reference, typename layout_type::reference > return_type
constexpr layout_element_functor()=default
std::conditional_t< is_const, const layout_type *, layout_type > storage_type
return_type operator()(std::size_t i) const
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:281
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
#define SPARROW_ASSERT_TRUE(expr__)
#define SPARROW_ASSERT_FALSE(expr__)
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr std::string_view data_type_format_of()
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool is_dictionary_encoded_array_v
Checks whether T is a dictionary_encoded_array type.
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
SPARROW_API array_traits::inner_value_type array_default_element_value(const array_wrapper &ar)
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArrays and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
primitive_array_impl< T > primitive_array
Array of values whose type has a fixed binary size.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
constexpr bool data_type_is_integer(data_type dt)
dynamic_bitset< std::uint8_t > validity_bitmap
SPARROW_API cloning_ptr< array_wrapper > array_factory(arrow_proxy proxy)
SPARROW_API array_traits::const_reference array_element(const array_wrapper &ar, std::size_t index)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Definition nullable.hpp:933
mpl::rename< mpl::transform< detail::array_const_reference_t, all_base_types_t >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type
mpl::rename< mpl::transform< detail::array_value_type_t, all_base_types_t >, nullable_variant > value_type
Provides compile-time information about Arrow data types.