sparrow 0.6.0
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <numeric>
20#include <ranges>
21#include <string>
22#include <vector>
23
34
35namespace sparrow
36{
37 namespace detail
38 {
39 template <class T, class OT>
41
42 template <>
43 struct variable_size_binary_format<std::string, std::int32_t>
44 {
45 [[nodiscard]] static std::string format()
46 {
47 return "u";
48 }
49 };
50
51 template <>
52 struct variable_size_binary_format<std::string, std::int64_t>
53 {
54 [[nodiscard]] static std::string format()
55 {
56 return "U";
57 }
58 };
59
60 template <>
61 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
62 {
63 [[nodiscard]] static std::string format()
64 {
65 return "z";
66 }
67 };
68
69 template <>
70 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
71 {
72 [[nodiscard]] static std::string format()
73 {
74 return "Z";
75 }
76 };
77 }
78
// Forward declaration of the variable-size binary array implementation.
//   T  - value type; must model std::ranges::sized_range
//   CR - const reference type used for values
//   OT - offset type; must satisfy the layout_offset concept
79 template <std::ranges::sized_range T, class CR, layout_offset OT>
80 class variable_size_binary_array_impl;
81
83
88 binary_traits::value_type,
89 binary_traits::const_reference,
90 std::int64_t>;
91
92 namespace detail
93 {
// Primary template, declared only. The specializations that follow each
// provide a static constexpr get() returning a sparrow::data_type.
94 template <class T>
95 struct get_data_type_from_array;
96
97 template <>
99 {
100 [[nodiscard]] static constexpr sparrow::data_type get()
101 {
103 }
104 };
105
106 template <>
108 {
109 [[nodiscard]] static constexpr sparrow::data_type get()
110 {
112 }
113 };
114
115 template <>
117 {
118 [[nodiscard]] static constexpr sparrow::data_type get()
119 {
121 }
122 };
123
124 template <>
126 {
127 [[nodiscard]] static constexpr sparrow::data_type get()
128 {
130 }
131 };
132 }
133
137 template <class T>
138 constexpr bool is_string_array_v = std::same_as<T, string_array>;
139
143 template <class T>
144 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
145
149 template <class T>
150 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
151
155 template <class T>
156 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
157
158 template <std::ranges::sized_range T, class CR, layout_offset OT>
160 {
162
166 using offset_type = OT;
167
168 using data_value_type = typename T::value_type;
169
170 using offset_iterator = OT*;
171 using const_offset_iterator = const OT*;
172
175
176 using iterator_tag = std::random_access_iterator_tag;
177
179
188
190
199
201
202 // using iterator = layout_iterator<array_type, false>;
203 // using const_iterator = layout_iterator<array_type, true, CR>;
204 };
205
206 template <std::ranges::sized_range T, class CR, layout_offset OT>
208 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT>>
209 {
210 private:
211
212 static_assert(
213 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
214 "Only sequences of types with the same size as uint8_t are supported"
215 );
216
217 public:
218
221
223 using inner_value_type = typename inner_types::inner_value_type;
224 using inner_reference = typename inner_types::inner_reference;
225 using inner_const_reference = typename inner_types::inner_const_reference;
226
227 using offset_type = typename inner_types::offset_type;
232
234 using bitmap_reference = typename base_type::bitmap_reference;
237
241
242 using offset_iterator = typename inner_types::offset_iterator;
243 using const_offset_iterator = typename inner_types::const_offset_iterator;
244
248 using data_iterator = typename inner_types::data_iterator;
249
250 using const_data_iterator = typename inner_types::const_data_iterator;
251 using data_value_type = typename inner_types::data_value_type;
252
253 using value_iterator = typename inner_types::value_iterator;
254 using const_value_iterator = typename inner_types::const_value_iterator;
255
257
258 template <class... ARGS>
261 : self_type(create_proxy(std::forward<ARGS>(args)...))
262 {
263 }
264
265 using base_type::get_arrow_proxy;
266 using base_type::size;
267
269 [[nodiscard]] inner_const_reference value(size_type i) const;
270
271 template <std::ranges::range SIZES_RANGE>
272 [[nodiscard]] static auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
273
274 private:
275
276 template <
279 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
280 [[nodiscard]] static arrow_proxy create_proxy(
281 u8_buffer<C>&& data_buffer,
282 offset_buffer_type&& list_offsets,
283 VB&& validity_input = validity_bitmap{},
284 std::optional<std::string_view> name = std::nullopt,
285 std::optional<METADATA_RANGE> metadata = std::nullopt
286 );
287
288 template <
289 std::ranges::input_range R,
291 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
292 requires(
293 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
295 // range of
296 // char-like
297 )
298 [[nodiscard]] static arrow_proxy create_proxy(
299 R&& values,
300 VB&& validity_input = validity_bitmap{},
301 std::optional<std::string_view> name = std::nullopt,
302 std::optional<METADATA_RANGE> metadata = std::nullopt
303 );
304
305 // range of nullable values
306 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
307 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
308 [[nodiscard]] static arrow_proxy create_proxy(
309 R&&,
310 std::optional<std::string_view> name = std::nullopt,
311 std::optional<METADATA_RANGE> metadata = std::nullopt
312 );
313
314 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
315 static constexpr size_t DATA_BUFFER_INDEX = 2;
316
317 [[nodiscard]] offset_iterator offset(size_type i);
318 [[nodiscard]] offset_iterator offsets_begin();
319 [[nodiscard]] offset_iterator offsets_end();
320 [[nodiscard]] data_iterator data(size_type i);
321
322 [[nodiscard]] value_iterator value_begin();
323 [[nodiscard]] value_iterator value_end();
324
325 [[nodiscard]] const_value_iterator value_cbegin() const;
326 [[nodiscard]] const_value_iterator value_cend() const;
327
328 [[nodiscard]] const_offset_iterator offset(size_type i) const;
329 [[nodiscard]] const_offset_iterator offsets_cbegin() const;
330 [[nodiscard]] const_offset_iterator offsets_cend() const;
331 [[nodiscard]] const_data_iterator data(size_type i) const;
332
333 // Modifiers
334
335 template <std::ranges::sized_range U>
336 requires mpl::convertible_ranges<U, T>
337 void resize_values(size_type new_length, U value);
338
339 void resize_offsets(size_type new_length, offset_type offset_value);
340
341 template <std::ranges::sized_range U>
342 requires mpl::convertible_ranges<U, T>
343 value_iterator insert_value(const_value_iterator pos, U value, size_type count);
344
345 offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
346
347 template <mpl::iterator_of_type<T> InputIt>
348 value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
349
350 template <mpl::iterator_of_type<OT> InputIt>
351 offset_iterator insert_offsets(const_offset_iterator pos, InputIt first, InputIt last);
352
353 value_iterator erase_values(const_value_iterator pos, size_type count);
354
355 offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
356
357 template <std::ranges::sized_range U>
358 requires mpl::convertible_ranges<U, T>
359 void assign(U&& rhs, size_type index);
360
363 friend base_type;
366 };
367
368 /*********************************************
369 * variable_size_binary_array_impl implementation *
370 *********************************************/
371
372 template <std::ranges::sized_range T, class CR, layout_offset OT>
374 : base_type(std::move(proxy))
375 {
376 const auto type = this->get_arrow_proxy().data_type();
377 SPARROW_ASSERT_TRUE(type == data_type::STRING || type == data_type::BINARY); // TODO: Add
378 // data_type::LARGE_STRING
379 // and
380 // data_type::LARGE_BINARY
382 (type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>
383 ) );
384 }
385
386 template <std::ranges::sized_range T, class CR, layout_offset OT>
387 template <std::ranges::range SIZES_RANGE>
390 {
391 return detail::offset_buffer_from_sizes<std::remove_const_t<offset_type>>(std::forward<SIZES_RANGE>(sizes
392 ));
393 }
394
395 template <std::ranges::sized_range T, class CR, layout_offset OT>
396 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
397 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
398 u8_buffer<C>&& data_buffer,
399 offset_buffer_type&& offsets,
400 VB&& validity_input,
401 std::optional<std::string_view> name,
402 std::optional<METADATA_RANGE> metadata
403 )
404 {
405 const auto size = offsets.size() - 1;
406 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
407 const auto null_count = vbitmap.null_count();
408
411 std::move(name), // name
412 std::move(metadata), // metadata
413 std::nullopt, // flags,
414 nullptr, // children
415 repeat_view<bool>(true, 0),
416 nullptr, // dictionary
417 true
418
419 );
420 std::vector<buffer<std::uint8_t>> arr_buffs = {
421 std::move(vbitmap).extract_storage(),
422 std::move(offsets).extract_storage(),
423 std::move(data_buffer).extract_storage()
424 };
425
427 static_cast<std::int64_t>(size), // length
428 static_cast<int64_t>(null_count),
429 0, // offset
430 std::move(arr_buffs),
431 nullptr, // children
432 repeat_view<bool>(true, 0),
433 nullptr, // dictionary
434 true
435 );
436 return arrow_proxy{std::move(arr), std::move(schema)};
437 }
438
439 template <std::ranges::sized_range T, class CR, layout_offset OT>
440 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
441 requires(
442 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
444 // range of char-like
445 )
446 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
447 R&& values,
448 VB&& validity_input,
449 std::optional<std::string_view> name,
450 std::optional<METADATA_RANGE> metadata
451 )
452 {
453 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
454
455 auto size_range = values
456 | std::views::transform(
457 [](const auto& v)
458 {
459 return std::ranges::size(v);
460 }
461 );
462 auto offset_buffer = offset_from_sizes(size_range);
463 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
464 return create_proxy(
465 std::move(data_buffer),
466 std::move(offset_buffer),
467 std::forward<VB>(validity_input),
468 std::forward<std::optional<std::string_view>>(name),
469 std::forward<std::optional<METADATA_RANGE>>(metadata)
470 );
471 }
472
473 template <std::ranges::sized_range T, class CR, layout_offset OT>
474 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
475 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
476 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
477 R&& range,
478 std::optional<std::string_view> name,
479 std::optional<METADATA_RANGE> metadata
480 )
481 {
482 // split into values and is_non_null ranges
483 const auto values = range
484 | std::views::transform(
485 [](const auto& v)
486 {
487 return v.get();
488 }
489 );
490 const auto is_non_null = range
491 | std::views::transform(
492 [](const auto& v)
493 {
494 return v.has_value();
495 }
496 );
497 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
498 }
499
500 template <std::ranges::sized_range T, class CR, layout_offset OT>
501 auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) -> data_iterator
502 {
503 arrow_proxy& proxy = get_arrow_proxy();
504 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
505 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
506 }
507
508 template <std::ranges::sized_range T, class CR, layout_offset OT>
509 auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) const -> const_data_iterator
510 {
511 const arrow_proxy& proxy = this->get_arrow_proxy();
512 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
513 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
514 }
515
516 template <std::ranges::sized_range T, class CR, layout_offset OT>
517 template <std::ranges::sized_range U>
519 void variable_size_binary_array_impl<T, CR, OT>::assign(U&& rhs, size_type index)
520 {
521 SPARROW_ASSERT_TRUE(index < size());
522 const auto offset_beg = *offset(index);
523 const auto offset_end = *offset(index + 1);
524 const auto initial_value_length = offset_end - offset_beg;
525 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
526 const OT shift_byte_count = new_value_length - initial_value_length;
527 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
528 if (shift_byte_count != 0)
529 {
530 const auto shift_val_abs = static_cast<size_type>(std::abs(shift_byte_count));
531 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
532 : data_buffer.size() + shift_val_abs;
533
534 if (shift_byte_count > 0)
535 {
536 data_buffer.resize(new_data_buffer_size);
537 // Move elements to make space for the new value
538 std::move_backward(
539 data_buffer.begin() + offset_end,
540 data_buffer.end() - shift_byte_count,
541 data_buffer.end()
542 );
543 }
544 else
545 {
546 std::move(
547 data_buffer.begin() + offset_end,
548 data_buffer.end(),
549 data_buffer.begin() + offset_end + shift_byte_count
550 );
551 data_buffer.resize(new_data_buffer_size);
552 }
553 // Adjust offsets for subsequent elements
554 std::for_each(
555 offset(index + 1),
556 offset(size() + 1),
557 [shift_byte_count](auto& offset)
558 {
559 offset += shift_byte_count;
560 }
561 );
562 }
563 auto tmp = std::views::transform(
564 rhs,
565 [](const auto& val)
566 {
567 return static_cast<std::uint8_t>(val);
568 }
569 );
570 // Copy the new value into the buffer
571 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), data_buffer.begin() + offset_beg);
572 }
573
574 template <std::ranges::sized_range T, class CR, layout_offset OT>
575 auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) -> offset_iterator
576 {
577 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
578 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
579 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
580 }
581
582 template <std::ranges::sized_range T, class CR, layout_offset OT>
583 auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) const -> const_offset_iterator
584 {
585 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
586 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
587 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
588 }
589
590 template <std::ranges::sized_range T, class CR, layout_offset OT>
591 auto variable_size_binary_array_impl<T, CR, OT>::offsets_begin() -> offset_iterator
592 {
593 return offset(0);
594 }
595
596 template <std::ranges::sized_range T, class CR, layout_offset OT>
597 auto variable_size_binary_array_impl<T, CR, OT>::offsets_cbegin() const -> const_offset_iterator
598 {
599 return offset(0);
600 }
601
602 template <std::ranges::sized_range T, class CR, layout_offset OT>
603 auto variable_size_binary_array_impl<T, CR, OT>::offsets_end() -> offset_iterator
604 {
605 return offset(size() + 1);
606 }
607
608 template <std::ranges::sized_range T, class CR, layout_offset OT>
609 auto variable_size_binary_array_impl<T, CR, OT>::offsets_cend() const -> const_offset_iterator
610 {
611 return offset(size() + 1);
612 }
613
614 template <std::ranges::sized_range T, class CR, layout_offset OT>
620
621 template <std::ranges::sized_range T, class CR, layout_offset OT>
623 {
624 SPARROW_ASSERT_TRUE(i < this->size());
625 const OT offset_begin = *offset(i);
626 SPARROW_ASSERT_TRUE(offset_begin >= 0);
627 const OT offset_end = *offset(i + 1);
628 SPARROW_ASSERT_TRUE(offset_end >= 0);
629 const const_data_iterator pointer_begin = data(static_cast<size_type>(offset_begin));
630 const const_data_iterator pointer_end = data(static_cast<size_type>(offset_end));
631 return inner_const_reference(pointer_begin, pointer_end);
632 }
633
634 template <std::ranges::sized_range T, class CR, layout_offset OT>
635 auto variable_size_binary_array_impl<T, CR, OT>::value_begin() -> value_iterator
636 {
637 return value_iterator{this, 0};
638 }
639
640 template <std::ranges::sized_range T, class CR, layout_offset OT>
641 auto variable_size_binary_array_impl<T, CR, OT>::value_end() -> value_iterator
642 {
643 return sparrow::next(value_begin(), size());
644 }
645
646 template <std::ranges::sized_range T, class CR, layout_offset OT>
647 auto variable_size_binary_array_impl<T, CR, OT>::value_cbegin() const -> const_value_iterator
648 {
649 return const_value_iterator{this, 0};
650 }
651
652 template <std::ranges::sized_range T, class CR, layout_offset OT>
653 auto variable_size_binary_array_impl<T, CR, OT>::value_cend() const -> const_value_iterator
654 {
655 return sparrow::next(value_cbegin(), this->size());
656 }
657
658 template <std::ranges::sized_range T, class CR, layout_offset OT>
659 template <std::ranges::sized_range U>
661 void variable_size_binary_array_impl<T, CR, OT>::resize_values(size_type new_length, U value)
662 {
663 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
664 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
665 if (new_length < size())
666 {
667 const auto offset_begin = static_cast<size_t>(*offset(new_length));
668 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
669 data_buffer.resize(offset_begin);
670 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
671 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
672 offset_buffer_adaptor.resize(new_size + 1);
673 }
674 else if (new_length > size())
675 {
676 insert_value(value_cend(), value, new_length - size());
677 }
678 }
679
680 template <std::ranges::sized_range T, class CR, layout_offset OT>
681 template <std::ranges::sized_range U>
683 auto
684 variable_size_binary_array_impl<T, CR, OT>::insert_value(const_value_iterator pos, U value, size_type count)
686 {
687 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
688 const OT offset_begin = *offset(idx);
689 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
690 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
691 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
692 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
693 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
694 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
695 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
696 return sparrow::next(value_begin(), idx);
697 }
698
// Inserts `count` new entries into the offset buffer, each describing a value
// of `value_size` bytes, and shifts the offsets that follow accordingly.
//
// pos        - position in the offsets, measured from offsets_cbegin()
// value_size - size of each inserted value
// count      - number of offsets to insert
// Returns offsets_begin() advanced by the distance of `pos` from offsets_cbegin().
//
// NOTE(review): `idx` is measured from offsets_cbegin(), which offset() shifts
// by the proxy offset, but it is then applied to offset_buffer_adaptor.begin().
// Confirm that the two agree when the proxy offset is not what this code assumes.
// NOTE(review): every adaptor position below is `idx + 1`, while insert_value()
// already passes `offsets_cbegin() + idx + 1` as `pos`. Confirm the double +1 is intended.
699 template <std::ranges::sized_range T, class CR, layout_offset OT>
700 auto variable_size_binary_array_impl<T, CR, OT>::insert_offset(
701 const_offset_iterator pos,
702 offset_type value_size,
703 size_type count
704 ) -> offset_iterator
705 {
706 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
707 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
708 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
// Total number of bytes added by the inserted values.
// NOTE(review): the product is computed in offset_type; no overflow check is visible here.
709 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
710 // Adjust offsets for subsequent elements
// Runs before the insertion, so it touches only the pre-existing entries from idx + 1 onwards.
711 std::for_each(
712 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
713 offset_buffer_adaptor.end(),
714 [cumulative_size](auto& offset)
715 {
716 offset += cumulative_size;
717 }
718 );
// Presumably inserts `count` entries of value 0 at position idx + 1; they are overwritten just below.
719 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
720 // Put the right values in the new offsets
// Each new entry is the previous entry plus value_size, so the loop must run in increasing order.
721 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
722 {
723 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
724 }
725 return offsets_begin() + idx;
726 }
727
728 template <std::ranges::sized_range T, class CR, layout_offset OT>
729 template <mpl::iterator_of_type<T> InputIt>
730 auto
731 variable_size_binary_array_impl<T, CR, OT>::insert_values(const_value_iterator pos, InputIt first, InputIt last)
733 {
734 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
735 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
736 auto values = std::ranges::subrange(first, last);
737 const size_t cumulative_sizes = std::accumulate(
738 values.begin(),
739 values.end(),
740 size_t(0),
741 [](size_t acc, const T& value)
742 {
743 return acc + value.size();
744 }
745 );
746 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
747 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
748 const OT offset_begin = *offset(idx);
749 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
750
751 // Move elements to make space for the new value
752 std::move_backward(
753 insert_pos,
754 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
755 data_buffer_adaptor.end()
756 );
757
758 for (const T& value : values)
759 {
760 std::copy(value.begin(), value.end(), insert_pos);
761 std::advance(insert_pos, value.size());
762 }
763
764 const auto sizes_of_each_value = std::ranges::views::transform(
765 values,
766 [](const T& value) -> offset_type
767 {
768 return static_cast<offset_type>(value.size());
769 }
770 );
771 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
772 return sparrow::next(value_begin(), idx);
773 }
774
775 template <std::ranges::sized_range T, class CR, layout_offset OT>
776 template <mpl::iterator_of_type<OT> InputIt>
777 auto variable_size_binary_array_impl<T, CR, OT>::insert_offsets(
779 InputIt first_sizes,
780 InputIt last_sizes
781 ) -> offset_iterator
782 {
783 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
784 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
785 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
786 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
787 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
788 const auto idx = std::distance(offsets_cbegin(), pos);
789 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
790 const auto sizes_count = std::distance(first_sizes, last_sizes);
791 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
792 // Move the offsets to make space for the new offsets
793 std::move_backward(
794 offset_buffer_adaptor.begin() + idx,
795 offset_buffer_adaptor.end() - sizes_count,
796 offset_buffer_adaptor.end()
797 );
798 // Adjust offsets for subsequent elements
799 std::for_each(
800 offset_buffer_adaptor.begin() + idx + sizes_count,
801 offset_buffer_adaptor.end(),
802 [cumulative_sizes](auto& offset)
803 {
804 offset += cumulative_sizes;
805 }
806 );
807 // Put the right values in the new offsets
808 InputIt it = first_sizes;
809 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
810 {
811 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
812 ++it;
813 }
814 return offset(static_cast<size_t>(idx));
815 }
816
817 template <std::ranges::sized_range T, class CR, layout_offset OT>
818 auto variable_size_binary_array_impl<T, CR, OT>::erase_values(const_value_iterator pos, size_type count)
819 -> value_iterator
820 {
821 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
822 SPARROW_ASSERT_TRUE(pos <= value_cend());
823 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
824 if (count == 0)
825 {
826 return sparrow::next(value_begin(), index);
827 }
828 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
829 const auto offset_begin = *offset(index);
830 const auto offset_end = *offset(index + count);
831 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
832 // move the values after the erased ones
833 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
834 data_buffer.resize(data_buffer.size() - difference);
835 // adjust the offsets for the subsequent elements
836 erase_offsets(offset(index), count);
837 return sparrow::next(value_begin(), index);
838 }
839
840 template <std::ranges::sized_range T, class CR, layout_offset OT>
841 auto variable_size_binary_array_impl<T, CR, OT>::erase_offsets(const_offset_iterator pos, size_type count)
842 -> offset_iterator
843 {
844 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
845 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
846 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
847 if (count == 0)
848 {
849 return offset(index);
850 }
851 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
852 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
853 const OT offset_start_value = *offset(index);
854 const OT offset_end_value = *offset(index + count);
855 const OT difference = offset_end_value - offset_start_value;
856 // move the offsets after the erased ones
857 std::move(
858 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
859 offset_buffer_adaptor.end(),
860 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
861 );
862 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
863 // adjust the offsets for the subsequent elements
864 std::for_each(
865 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
866 offset_buffer_adaptor.end(),
867 [difference](OT& offset)
868 {
869 offset -= difference;
870 }
871 );
872 return offset(index);
873 }
874
875}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
constexpr size_type null_count() const noexcept
bitset_iterator< self_type, true > const_iterator
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
static auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
inner_const_reference value(size_type i) const
Implementation of reference to inner type used for layout L.
Iterator over the data values of a variable size binary layout.
Concept for iterator types.
Matches range types From whose elements are convertible to elements of range type To.
Definition mp_utils.hpp:450
#define SPARROW_ASSERT_TRUE(expr__)
sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< binary_traits::value_type, binary_traits::const_reference, std::int32_t > binary_array
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
variable_size_binary_array_impl< std::string, std::string_view, std::int32_t > string_array
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
variable_size_binary_array_impl< std::string, std::string_view, std::int64_t > big_string_array
variable_size_binary_array_impl< binary_traits::value_type, binary_traits::const_reference, std::int64_t > big_binary_array
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
arrow_traits< std::vector< byte_t > > binary_traits
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.
Provides compile-time information about Arrow data types.