sparrow 0.9.0
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <numeric>
20#include <optional>
21#include <ranges>
22#include <string>
23#include <vector>
24
36
37namespace sparrow
38{
39 namespace detail
40 {
41 template <class T, class OT>
43
44 template <>
45 struct variable_size_binary_format<std::string, std::int32_t>
46 {
47 [[nodiscard]] static std::string format()
48 {
49 return "u";
50 }
51 };
52
53 template <>
54 struct variable_size_binary_format<std::string, std::int64_t>
55 {
56 [[nodiscard]] static std::string format()
57 {
58 return "U";
59 }
60 };
61
62 template <>
63 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
64 {
65 [[nodiscard]] static std::string format()
66 {
67 return "z";
68 }
69 };
70
71 template <>
72 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
73 {
74 [[nodiscard]] static std::string format()
75 {
76 return "Z";
77 }
78 };
79 }
80
/**
 * Forward declaration of the variable-size binary array layout.
 *
 * @tparam T  Sized range type of a single stored value (e.g. std::string).
 * @tparam CR Const reference type handed out for a value (e.g. std::string_view).
 * @tparam OT Offset type; constrained by the layout_offset concept.
 */
81 template <std::ranges::sized_range T, class CR, layout_offset OT>
82 class variable_size_binary_array_impl;
83
85
90 binary_traits::value_type,
91 binary_traits::const_reference,
92 std::int64_t>;
93
94 namespace detail
95 {
// Primary template, declared but not defined here. The explicit specializations
// that follow each expose a static constexpr get() returning a sparrow::data_type.
96 template <class T>
97 struct get_data_type_from_array;
98
99 template <>
101 {
102 [[nodiscard]] static constexpr sparrow::data_type get()
103 {
105 }
106 };
107
108 template <>
110 {
111 [[nodiscard]] static constexpr sparrow::data_type get()
112 {
114 }
115 };
116
117 template <>
119 {
120 [[nodiscard]] static constexpr sparrow::data_type get()
121 {
123 }
124 };
125
126 template <>
128 {
129 [[nodiscard]] static constexpr sparrow::data_type get()
130 {
132 }
133 };
134 }
135
139 template <class T>
140 constexpr bool is_string_array_v = std::same_as<T, string_array>;
141
145 template <class T>
146 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
147
151 template <class T>
152 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
153
157 template <class T>
158 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
159
160 template <std::ranges::sized_range T, class CR, layout_offset OT>
162 {
164
168 using offset_type = OT;
169
170 using data_value_type = typename T::value_type;
171
172 using offset_iterator = OT*;
173 using const_offset_iterator = const OT*;
174
177
178 using iterator_tag = std::random_access_iterator_tag;
179
181
190
192
201
203
204 // using iterator = layout_iterator<array_type, false>;
205 // using const_iterator = layout_iterator<array_type, true, CR>;
206 };
207
208 template <std::ranges::sized_range T, class CR, layout_offset OT>
210 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT>>
211 {
212 private:
213
214 static_assert(
215 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
216 "Only sequences of types with the same size as uint8_t are supported"
217 );
218
219 public:
220
223
225 using inner_value_type = typename inner_types::inner_value_type;
226 using inner_reference = typename inner_types::inner_reference;
227 using inner_const_reference = typename inner_types::inner_const_reference;
228
229 using offset_type = typename inner_types::offset_type;
234
236 using bitmap_reference = typename base_type::bitmap_reference;
239
243
244 using offset_iterator = typename inner_types::offset_iterator;
245 using const_offset_iterator = typename inner_types::const_offset_iterator;
246
250 using data_iterator = typename inner_types::data_iterator;
251
252 using const_data_iterator = typename inner_types::const_data_iterator;
253 using data_value_type = typename inner_types::data_value_type;
254
255 using value_iterator = typename inner_types::value_iterator;
256 using const_value_iterator = typename inner_types::const_value_iterator;
257
259
260 template <class... ARGS>
263 : self_type(create_proxy(std::forward<ARGS>(args)...))
264 {
265 }
266
267 using base_type::get_arrow_proxy;
268 using base_type::size;
269
271 [[nodiscard]] inner_const_reference value(size_type i) const;
272
273 template <std::ranges::range SIZES_RANGE>
274 [[nodiscard]] static auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
275
276 private:
277
278 template <
281 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
282 [[nodiscard]] static arrow_proxy create_proxy(
283 u8_buffer<C>&& data_buffer,
284 offset_buffer_type&& list_offsets,
285 VB&& validity_input = validity_bitmap{},
286 std::optional<std::string_view> name = std::nullopt,
287 std::optional<METADATA_RANGE> metadata = std::nullopt
288 );
289
290 template <
291 std::ranges::input_range R,
293 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
294 requires(
295 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
297 // range of
298 // char-like
299 )
300 [[nodiscard]] static arrow_proxy create_proxy(
301 R&& values,
302 VB&& validity_input = validity_bitmap{},
303 std::optional<std::string_view> name = std::nullopt,
304 std::optional<METADATA_RANGE> metadata = std::nullopt
305 );
306
307 template <
308 std::ranges::input_range R,
309 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
310 requires(
311 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
312 mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>> // inner range is a
313 // range of
314 // char-like
315 )
316 [[nodiscard]] static arrow_proxy create_proxy(
317 R&& values,
318 bool nullable,
319 std::optional<std::string_view> name = std::nullopt,
320 std::optional<METADATA_RANGE> metadata = std::nullopt
321 );
322
323 // range of nullable values
324 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
325 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
326 [[nodiscard]] static arrow_proxy create_proxy(
327 R&&,
328 std::optional<std::string_view> name = std::nullopt,
329 std::optional<METADATA_RANGE> metadata = std::nullopt
330 );
331
332 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
333 [[nodiscard]] static arrow_proxy create_proxy_impl(
334 u8_buffer<C>&& data_buffer,
335 offset_buffer_type&& list_offsets,
336 std::optional<validity_bitmap>&&,
337 std::optional<std::string_view> name = std::nullopt,
338 std::optional<METADATA_RANGE> metadata = std::nullopt
339 );
340
341 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
342 static constexpr size_t DATA_BUFFER_INDEX = 2;
343
344 [[nodiscard]] offset_iterator offset(size_type i);
345 [[nodiscard]] offset_iterator offsets_begin();
346 [[nodiscard]] offset_iterator offsets_end();
347 [[nodiscard]] data_iterator data(size_type i);
348
349 [[nodiscard]] value_iterator value_begin();
350 [[nodiscard]] value_iterator value_end();
351
352 [[nodiscard]] const_value_iterator value_cbegin() const;
353 [[nodiscard]] const_value_iterator value_cend() const;
354
355 [[nodiscard]] const_offset_iterator offset(size_type i) const;
356 [[nodiscard]] const_offset_iterator offsets_cbegin() const;
357 [[nodiscard]] const_offset_iterator offsets_cend() const;
358 [[nodiscard]] const_data_iterator data(size_type i) const;
359
360 // Modifiers
361
362 template <std::ranges::sized_range U>
363 requires mpl::convertible_ranges<U, T>
364 void resize_values(size_type new_length, U value);
365
366 void resize_offsets(size_type new_length, offset_type offset_value);
367
368 template <std::ranges::sized_range U>
369 requires mpl::convertible_ranges<U, T>
370 value_iterator insert_value(const_value_iterator pos, U value, size_type count);
371
372 offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
373
374 template <mpl::iterator_of_type<T> InputIt>
375 value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
376
377 template <mpl::iterator_of_type<OT> InputIt>
378 offset_iterator insert_offsets(const_offset_iterator pos, InputIt first, InputIt last);
379
380 value_iterator erase_values(const_value_iterator pos, size_type count);
381
382 offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
383
384 template <std::ranges::sized_range U>
385 requires mpl::convertible_ranges<U, T>
386 void assign(U&& rhs, size_type index);
387
390 friend base_type;
393 };
394
395 /*********************************************
396 * variable_size_binary_array_impl implementation *
397 *********************************************/
398
399 template <std::ranges::sized_range T, class CR, layout_offset OT>
401 : base_type(std::move(proxy))
402 {
403 const auto type = this->get_arrow_proxy().data_type();
406 || type == data_type::LARGE_BINARY
407 );
409 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
410 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
411 && std::same_as<OT, int64_t>) )
412 );
413 }
414
415 template <std::ranges::sized_range T, class CR, layout_offset OT>
416 template <std::ranges::range SIZES_RANGE>
419 {
421 std::forward<SIZES_RANGE>(sizes)
422 );
423 }
424
425 template <std::ranges::sized_range T, class CR, layout_offset OT>
426 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
427 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
428 u8_buffer<C>&& data_buffer,
429 offset_buffer_type&& offsets,
430 VB&& validity_input,
431 std::optional<std::string_view> name,
432 std::optional<METADATA_RANGE> metadata
433 )
434 {
435 const auto size = offsets.size() - 1;
436 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
437 const auto null_count = vbitmap.null_count();
438
441 std::move(name), // name
442 std::move(metadata), // metadata
443 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
444 nullptr, // children
445 repeat_view<bool>(true, 0),
446 nullptr, // dictionary
447 true
448
449 );
450 std::vector<buffer<std::uint8_t>> arr_buffs = {
451 std::move(vbitmap).extract_storage(),
452 std::move(offsets).extract_storage(),
453 std::move(data_buffer).extract_storage()
454 };
455
456 ArrowArray arr = make_arrow_array(
457 static_cast<std::int64_t>(size), // length
458 static_cast<int64_t>(null_count),
459 0, // offset
460 std::move(arr_buffs),
461 nullptr, // children
462 repeat_view<bool>(true, 0),
463 nullptr, // dictionary
464 true
465 );
466 return arrow_proxy{std::move(arr), std::move(schema)};
467 }
468
/**
 * Builds an arrow_proxy from a range of ranges and a validity input.
 *
 * The length of every inner range is fed to offset_from_sizes to obtain the
 * offset buffer, the inner ranges are flattened into a single u8_buffer, and
 * both buffers are handed to the buffer-based create_proxy overload.
 * Note that `values` is traversed twice: once for the sizes, once for the content.
 *
 * @param values         Range whose elements are themselves ranges.
 * @param validity_input Validity information, forwarded unchanged.
 * @param name           Optional name, forwarded unchanged.
 * @param metadata       Optional metadata, forwarded unchanged.
 * @return The arrow_proxy produced by the buffer-based overload.
 */
469 template <std::ranges::sized_range T, class CR, layout_offset OT>
470 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
471 requires(
472 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
474 // range of char-like
475 )
476 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
477 R&& values,
478 VB&& validity_input,
479 std::optional<std::string_view> name,
480 std::optional<METADATA_RANGE> metadata
481 )
482 {
// Element type of the inner ranges; used as the element type of the data buffer.
483 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
484
// Lazy view yielding the size of each inner range.
485 auto size_range = values
486 | std::views::transform(
487 [](const auto& v)
488 {
489 return std::ranges::size(v);
490 }
491 );
492 auto offset_buffer = offset_from_sizes(size_range);
// Concatenate all inner ranges into one contiguous buffer.
493 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
494 return create_proxy(
495 std::move(data_buffer),
496 std::move(offset_buffer),
497 std::forward<VB>(validity_input),
498 std::forward<std::optional<std::string_view>>(name),
499 std::forward<std::optional<METADATA_RANGE>>(metadata)
500 );
501 }
502
503 template <std::ranges::sized_range T, class CR, layout_offset OT>
504 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
505 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
506 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
507 R&& range,
508 std::optional<std::string_view> name,
509 std::optional<METADATA_RANGE> metadata
510 )
511 {
512 // split into values and is_non_null ranges
513 const auto values = range
514 | std::views::transform(
515 [](const auto& v)
516 {
517 return v.get();
518 }
519 );
520 const auto is_non_null = range
521 | std::views::transform(
522 [](const auto& v)
523 {
524 return v.has_value();
525 }
526 );
527 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
528 }
529
/**
 * Builds an arrow_proxy from a range of ranges, with an explicit nullability switch.
 *
 * The inner ranges are flattened into the data buffer and their sizes are
 * converted to an offset buffer; the result is delegated to create_proxy_impl.
 *
 * @param values   Range whose elements are themselves ranges.
 * @param nullable When true, a validity_bitmap constructed from (nullptr, size)
 *                 is passed on; when false, std::nullopt is passed instead.
 * @param name     Optional name, moved into the delegated call.
 * @param metadata Optional metadata, moved into the delegated call.
 * @return The arrow_proxy produced by create_proxy_impl.
 */
530 template <std::ranges::sized_range T, class CR, layout_offset OT>
531 template <
532 std::ranges::input_range R,
533 input_metadata_container METADATA_RANGE>
534 requires(
535 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
537 // range of
538 // char-like
539 )
540 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
541 R&& values,
542 bool nullable,
543 std::optional<std::string_view> name,
544 std::optional<METADATA_RANGE> metadata
545 )
546 {
547 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
// Number of elements (outer range length); only used to size the optional bitmap.
548 const size_t size = std::ranges::size(values);
// Concatenate all inner ranges into one contiguous buffer.
549 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
// Lazy view yielding the size of each inner range.
550 auto size_range = values
551 | std::views::transform(
552 [](const auto& v)
553 {
554 return std::ranges::size(v);
555 }
556 );
557 auto offset_buffer = offset_from_sizes(size_range);
558 return create_proxy_impl(
559 std::move(data_buffer),
560 std::move(offset_buffer),
561 nullable ? std::make_optional<validity_bitmap>(nullptr, size) : std::nullopt,
562 std::move(name),
563 std::move(metadata)
564 );
565 }
566
/**
 * Assembles the ArrowSchema / ArrowArray pair from already-built buffers.
 *
 * @param data_buffer  Concatenated value bytes; its storage is extracted.
 * @param list_offsets Offset buffer; the array length is list_offsets.size() - 1.
 * @param bitmap       Optional validity bitmap. When absent, the null count is 0,
 *                     no flags are set on the schema and an empty buffer
 *                     (nullptr, 0) is stored in the first buffer slot.
 * @param name         Optional name, moved into the schema.
 * @param metadata     Optional metadata, moved into the schema.
 * @return An arrow_proxy owning the created array and schema.
 */
567 template <std::ranges::sized_range T, class CR, layout_offset OT>
568 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
569 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy_impl(
570 u8_buffer<C>&& data_buffer,
571 offset_buffer_type&& list_offsets,
572 std::optional<validity_bitmap>&& bitmap,
573 std::optional<std::string_view> name,
574 std::optional<METADATA_RANGE> metadata
575 )
576 {
// There is one more offset than there are elements.
577 const auto size = list_offsets.size() - 1;
578 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
579
// The NULLABLE flag is only set when a validity bitmap was supplied.
580 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
581 flags = bitmap.has_value()
582 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
583 : std::nullopt;
584
585 ArrowSchema schema = make_arrow_schema(
587 std::move(name), // name
588 std::move(metadata), // metadata
589 flags, // flags,
590 nullptr, // children
591 repeat_view<bool>(true, 0),
592 nullptr, // dictionary
593 true
594
595 );
// Buffer order: validity, offsets (OFFSET_BUFFER_INDEX), data (DATA_BUFFER_INDEX).
596 std::vector<buffer<std::uint8_t>> arr_buffs = {
597 bitmap.has_value() ? std::move(*bitmap).extract_storage() : buffer<std::uint8_t>{nullptr, 0},
598 std::move(list_offsets).extract_storage(),
599 std::move(data_buffer).extract_storage()
600 };
601
602 ArrowArray arr = make_arrow_array(
603 static_cast<std::int64_t>(size), // length
604 static_cast<int64_t>(null_count),
605 0, // offset
606 std::move(arr_buffs),
607 nullptr, // children
608 repeat_view<bool>(true, 0),
609 nullptr, // dictionary
610 true
611 );
612 return arrow_proxy{std::move(arr), std::move(schema)};
613 }
614
615 template <std::ranges::sized_range T, class CR, layout_offset OT>
616 auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) -> data_iterator
617 {
618 arrow_proxy& proxy = get_arrow_proxy();
619 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
620 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
621 }
622
623 template <std::ranges::sized_range T, class CR, layout_offset OT>
624 auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) const -> const_data_iterator
625 {
626 const arrow_proxy& proxy = this->get_arrow_proxy();
627 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
628 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
629 }
630
/**
 * Replaces the bytes of the element at `index` with the content of `rhs`.
 *
 * When the new value has a different length than the stored one, the data
 * buffer is grown or shrunk, the bytes located after the element are moved
 * accordingly, and every following offset is shifted by the size difference.
 * The elements of `rhs` are converted to std::uint8_t while being copied.
 *
 * @param rhs   New value for the element.
 * @param index Position of the element; asserted to be less than size().
 */
631 template <std::ranges::sized_range T, class CR, layout_offset OT>
632 template <std::ranges::sized_range U>
634 void variable_size_binary_array_impl<T, CR, OT>::assign(U&& rhs, size_type index)
635 {
636 SPARROW_ASSERT_TRUE(index < size());
637 const auto offset_beg = *offset(index);
638 const auto offset_end = *offset(index + 1);
639 const auto initial_value_length = offset_end - offset_beg;
640 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
// Positive: the element grows; negative: it shrinks; zero: overwrite in place.
641 const OT shift_byte_count = new_value_length - initial_value_length;
642 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
643 if (shift_byte_count != 0)
644 {
645 const auto shift_val_abs = static_cast<size_type>(std::abs(shift_byte_count));
646 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
647 : data_buffer.size() + shift_val_abs;
648
649 if (shift_byte_count > 0)
650 {
// Growing: enlarge first, then move the tail backward into the new space.
651 data_buffer.resize(new_data_buffer_size);
652 // Move elements to make space for the new value
653 std::move_backward(
654 sparrow::next(data_buffer.begin(), offset_end),
655 sparrow::next(data_buffer.end(), -shift_byte_count),
656 data_buffer.end()
657 );
658 }
659 else
660 {
// Shrinking: move the tail towards the front first, then drop the unused bytes.
661 std::move(
662 sparrow::next(data_buffer.begin(), offset_end),
663 data_buffer.end(),
664 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
665 );
666 data_buffer.resize(new_data_buffer_size);
667 }
668 // Adjust offsets for subsequent elements
669 std::for_each(
670 offset(index + 1),
671 offset(size() + 1),
672 [shift_byte_count](auto& offset)
673 {
674 offset += shift_byte_count;
675 }
676 );
677 }
// View converting each element of rhs to a byte.
678 auto tmp = std::views::transform(
679 rhs,
680 [](const auto& val)
681 {
682 return static_cast<std::uint8_t>(val);
683 }
684 );
685 // Copy the new value into the buffer
686 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
687 }
688
689 template <std::ranges::sized_range T, class CR, layout_offset OT>
690 auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) -> offset_iterator
691 {
692 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
693 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
694 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
695 }
696
697 template <std::ranges::sized_range T, class CR, layout_offset OT>
698 auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) const -> const_offset_iterator
699 {
700 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
701 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
702 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
703 }
704
705 template <std::ranges::sized_range T, class CR, layout_offset OT>
706 auto variable_size_binary_array_impl<T, CR, OT>::offsets_begin() -> offset_iterator
707 {
708 return offset(0);
709 }
710
711 template <std::ranges::sized_range T, class CR, layout_offset OT>
712 auto variable_size_binary_array_impl<T, CR, OT>::offsets_cbegin() const -> const_offset_iterator
713 {
714 return offset(0);
715 }
716
717 template <std::ranges::sized_range T, class CR, layout_offset OT>
718 auto variable_size_binary_array_impl<T, CR, OT>::offsets_end() -> offset_iterator
719 {
720 return offset(size() + 1);
721 }
722
723 template <std::ranges::sized_range T, class CR, layout_offset OT>
724 auto variable_size_binary_array_impl<T, CR, OT>::offsets_cend() const -> const_offset_iterator
725 {
726 return offset(size() + 1);
727 }
728
729 template <std::ranges::sized_range T, class CR, layout_offset OT>
735
// Read-only access to the i-th value (the signature line is absent from this listing;
// the class declares `inner_const_reference value(size_type i) const`).
// The value spans the data buffer from offset i up to, but excluding, offset i + 1.
// Asserts that i is in range and that both offsets are non-negative.
736 template <std::ranges::sized_range T, class CR, layout_offset OT>
738 {
739 SPARROW_ASSERT_TRUE(i < this->size());
740 const OT offset_begin = *offset(i);
741 SPARROW_ASSERT_TRUE(offset_begin >= 0);
742 const OT offset_end = *offset(i + 1);
743 SPARROW_ASSERT_TRUE(offset_end >= 0);
// Translate both offsets into positions inside the data buffer.
744 const const_data_iterator pointer_begin = data(static_cast<size_type>(offset_begin));
745 const const_data_iterator pointer_end = data(static_cast<size_type>(offset_end));
746 return inner_const_reference(pointer_begin, pointer_end);
747 }
748
749 template <std::ranges::sized_range T, class CR, layout_offset OT>
750 auto variable_size_binary_array_impl<T, CR, OT>::value_begin() -> value_iterator
751 {
752 return value_iterator{this, 0};
753 }
754
755 template <std::ranges::sized_range T, class CR, layout_offset OT>
756 auto variable_size_binary_array_impl<T, CR, OT>::value_end() -> value_iterator
757 {
758 return sparrow::next(value_begin(), size());
759 }
760
761 template <std::ranges::sized_range T, class CR, layout_offset OT>
762 auto variable_size_binary_array_impl<T, CR, OT>::value_cbegin() const -> const_value_iterator
763 {
764 return const_value_iterator{this, 0};
765 }
766
767 template <std::ranges::sized_range T, class CR, layout_offset OT>
768 auto variable_size_binary_array_impl<T, CR, OT>::value_cend() const -> const_value_iterator
769 {
770 return sparrow::next(value_cbegin(), this->size());
771 }
772
/**
 * Resizes the value storage (data and offsets) to hold `new_length` elements.
 *
 * - Shrinking: the data buffer is truncated at the offset of element
 *   `new_length`, and the offset buffer is resized to hold
 *   `new_length + proxy offset + 1` entries.
 * - Growing: `value` is inserted at the end as many times as needed.
 * - Same length: nothing happens.
 *
 * @param new_length Desired number of elements.
 * @param value      Value used to fill newly created elements.
 */
773 template <std::ranges::sized_range T, class CR, layout_offset OT>
774 template <std::ranges::sized_range U>
776 void variable_size_binary_array_impl<T, CR, OT>::resize_values(size_type new_length, U value)
777 {
// Element count expressed relative to the start of the buffers (includes the proxy offset).
778 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
779 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
780 if (new_length < size())
781 {
// First byte that no longer belongs to the array after shrinking.
782 const auto offset_begin = static_cast<size_t>(*offset(new_length));
783 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
784 data_buffer.resize(offset_begin);
785 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
786 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
// One more offset than elements.
787 offset_buffer_adaptor.resize(new_size + 1);
788 }
789 else if (new_length > size())
790 {
791 insert_value(value_cend(), value, new_length - size());
792 }
793 }
794
/**
 * Inserts `count` copies of `value` before the element designated by `pos`.
 *
 * The value is copied into a byte vector, repeated `count` times and inserted
 * into the data buffer at the start offset of the element at `pos`; the offset
 * buffer is then updated through insert_offset.
 *
 * @param pos   Position before which the copies are inserted.
 * @param value Value to insert.
 * @param count Number of copies.
 * @return Iterator to the index at which the insertion took place.
 */
795 template <std::ranges::sized_range T, class CR, layout_offset OT>
796 template <std::ranges::sized_range U>
798 auto
799 variable_size_binary_array_impl<T, CR, OT>::insert_value(const_value_iterator pos, U value, size_type count)
801 {
802 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
803 const OT offset_begin = *offset(idx);
// Byte copy of the value, then a lazy concatenation of `count` repetitions of it.
804 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
805 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
806 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
807 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
808 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
809 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
810 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
811 return sparrow::next(value_begin(), idx);
812 }
813
814 template <std::ranges::sized_range T, class CR, layout_offset OT>
815 auto variable_size_binary_array_impl<T, CR, OT>::insert_offset(
816 const_offset_iterator pos,
817 offset_type value_size,
818 size_type count
819 ) -> offset_iterator
820 {
821 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
822 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
823 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
824 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
825 // Adjust offsets for subsequent elements
826 std::for_each(
827 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
828 offset_buffer_adaptor.end(),
829 [cumulative_size](auto& offset)
830 {
831 offset += cumulative_size;
832 }
833 );
834 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
835 // Put the right values in the new offsets
836 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
837 {
838 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
839 }
840 return offsets_begin() + idx;
841 }
842
/**
 * Inserts the values of [first, last) before the element designated by `pos`.
 *
 * The data buffer is enlarged by the cumulated size of the new values, the
 * bytes located at and after the insertion point are moved to the end, the new
 * values are copied into the gap, and the offset buffer is updated through
 * insert_offsets.
 *
 * @param pos   Position before which the values are inserted.
 * @param first Iterator to the first value to insert.
 * @param last  Iterator past the last value to insert.
 * @return Iterator to the index at which the insertion took place.
 */
843 template <std::ranges::sized_range T, class CR, layout_offset OT>
844 template <mpl::iterator_of_type<T> InputIt>
845 auto
846 variable_size_binary_array_impl<T, CR, OT>::insert_values(const_value_iterator pos, InputIt first, InputIt last)
848 {
849 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
850 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
851 auto values = std::ranges::subrange(first, last);
// Total number of bytes that the new values will occupy.
852 const size_t cumulative_sizes = std::accumulate(
853 values.begin(),
854 values.end(),
855 size_t(0),
856 [](size_t acc, const T& value)
857 {
858 return acc + value.size();
859 }
860 );
861 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
862 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
863 const OT offset_begin = *offset(idx);
864 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
865
866 // Move elements to make space for the new value
867 std::move_backward(
868 insert_pos,
869 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
870 data_buffer_adaptor.end()
871 );
872
// Copy the new values one after another into the gap.
873 for (const T& value : values)
874 {
875 std::copy(value.begin(), value.end(), insert_pos);
876 std::advance(insert_pos, value.size());
877 }
878
// Lazy view of the size of each inserted value, consumed by insert_offsets.
879 const auto sizes_of_each_value = std::ranges::views::transform(
880 values,
881 [](const T& value) -> offset_type
882 {
883 return static_cast<offset_type>(value.size());
884 }
885 );
886 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
887 return sparrow::next(value_begin(), idx);
888 }
889
/**
 * Inserts offsets for a sequence of new elements whose sizes are [first_sizes, last_sizes).
 *
 * The offset buffer is enlarged by the number of sizes, the existing offsets
 * starting at the insertion index are moved to the end, the offsets following
 * the inserted ones are shifted by the sum of the sizes, and each new offset is
 * set to its predecessor plus the corresponding size.
 *
 * The first parameter (`pos`, whose declaration line is absent from this listing)
 * is asserted to lie within [offsets_cbegin(), offsets_cend()].
 *
 * @param first_sizes Iterator to the first size.
 * @param last_sizes  Iterator past the last size; asserted not to precede first_sizes.
 * @return Iterator to the offset at the insertion index.
 */
890 template <std::ranges::sized_range T, class CR, layout_offset OT>
891 template <mpl::iterator_of_type<OT> InputIt>
892 auto variable_size_binary_array_impl<T, CR, OT>::insert_offsets(
894 InputIt first_sizes,
895 InputIt last_sizes
896 ) -> offset_iterator
897 {
898 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
899 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
900 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
901 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
902 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
903 const auto idx = std::distance(offsets_cbegin(), pos);
// Total size of the inserted elements, and how many of them there are.
904 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
905 const auto sizes_count = std::distance(first_sizes, last_sizes);
906 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
907 // Move the offsets to make space for the new offsets
908 std::move_backward(
909 offset_buffer_adaptor.begin() + idx,
910 offset_buffer_adaptor.end() - sizes_count,
911 offset_buffer_adaptor.end()
912 );
913 // Adjust offsets for subsequent elements
914 std::for_each(
915 offset_buffer_adaptor.begin() + idx + sizes_count,
916 offset_buffer_adaptor.end(),
917 [cumulative_sizes](auto& offset)
918 {
919 offset += cumulative_sizes;
920 }
921 );
922 // Put the right values in the new offsets
923 InputIt it = first_sizes;
924 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
925 {
926 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
927 ++it;
928 }
929 return offset(static_cast<size_t>(idx));
930 }
931
932 template <std::ranges::sized_range T, class CR, layout_offset OT>
933 auto variable_size_binary_array_impl<T, CR, OT>::erase_values(const_value_iterator pos, size_type count)
934 -> value_iterator
935 {
936 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
937 SPARROW_ASSERT_TRUE(pos <= value_cend());
938 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
939 if (count == 0)
940 {
941 return sparrow::next(value_begin(), index);
942 }
943 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
944 const auto offset_begin = *offset(index);
945 const auto offset_end = *offset(index + count);
946 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
947 // move the values after the erased ones
948 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
949 data_buffer.resize(data_buffer.size() - difference);
950 // adjust the offsets for the subsequent elements
951 erase_offsets(offset(index), count);
952 return sparrow::next(value_begin(), index);
953 }
954
955 template <std::ranges::sized_range T, class CR, layout_offset OT>
956 auto variable_size_binary_array_impl<T, CR, OT>::erase_offsets(const_offset_iterator pos, size_type count)
957 -> offset_iterator
958 {
959 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
960 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
961 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
962 if (count == 0)
963 {
964 return offset(index);
965 }
966 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
967 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
968 const OT offset_start_value = *offset(index);
969 const OT offset_end_value = *offset(index + count);
970 const OT difference = offset_end_value - offset_start_value;
971 // move the offsets after the erased ones
972 std::move(
973 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
974 offset_buffer_adaptor.end(),
975 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
976 );
977 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
978 // adjust the offsets for the subsequent elements
979 std::for_each(
980 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
981 offset_buffer_adaptor.end(),
982 [difference](OT& offset)
983 {
984 offset -= difference;
985 }
986 );
987 return offset(index);
988 }
989
990}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
constexpr size_type null_count() const noexcept
bitset_iterator< self_type, true > const_iterator
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:281
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
static auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
inner_const_reference value(size_type i) const
Implementation of reference to inner type used for layout L.
Iterator over the data values of a variable size binary layout.
Concept for iterator types.
Matches range types From whose elements are convertible to elements of range type To.
Definition mp_utils.hpp:450
#define SPARROW_ASSERT_TRUE(expr__)
sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< binary_traits::value_type, binary_traits::const_reference, std::int32_t > binary_array
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
variable_size_binary_array_impl< std::string, std::string_view, std::int64_t > big_string_array
variable_size_binary_array_impl< binary_traits::value_type, binary_traits::const_reference, std::int64_t > big_binary_array
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
arrow_traits< std::vector< byte_t > > binary_traits
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.
Provides compile-time information about Arrow data types.