sparrow 2.0.0
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <cstring>
19#include <ranges>
20#include <unordered_map>
21
30#include "sparrow/u8_buffer.hpp"
39
40namespace sparrow
41{
42 template <std::ranges::sized_range T, class CR, typename Ext = empty_extension>
44
55
65 arrow_traits<std::vector<byte_t>>::const_reference>;
66
// Two explicit specializations, each exposing a static constexpr get()
// that returns a sparrow::data_type.
// NOTE(review): the listing skips the lines naming the specialized template
// (70, 79) and the return statements (74, 83); confirm which data_type each
// specialization yields against the original header.
67 namespace detail
68 {
69 template <>
71 {
72 [[nodiscard]] static constexpr sparrow::data_type get()
73 {
75 }
76 };
77
78 template <>
80 {
81 [[nodiscard]] static constexpr sparrow::data_type get()
82 {
84 }
85 };
86 }
87
88 template <std::ranges::sized_range T, class CR, typename Ext>
101
102 template <class T>
104 {
105 };
106
107 template <std::ranges::sized_range T, class CR, typename Ext>
109 : std::true_type
110 {
111 };
112
116 template <class T>
118
158 template <std::ranges::sized_range T, class CR, typename Ext>
160 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T, CR, Ext>>,
161 public Ext
162 {
163 public:
164
167
169 using inner_value_type = typename inner_types::inner_value_type;
170 using inner_reference = typename inner_types::inner_reference;
171 using inner_const_reference = typename inner_types::inner_const_reference;
172
174 using bitmap_reference = typename base_type::bitmap_reference;
178 using bitmap_range = typename base_type::bitmap_range;
180
184
188
189 using value_iterator = typename base_type::value_iterator;
190 using const_value_iterator = typename base_type::const_value_iterator;
191
192 using iterator = typename base_type::iterator;
193 using const_iterator = typename base_type::const_iterator;
194
207
222 template <class... Args>
225 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
226 {
227 }
228
229 private:
230
// Bundle of the three buffers produced by create_buffers().
// Member order matters: create_buffers() returns this type through
// aggregate initialization.
239 struct buffers_collection
240 {
// One DATA_BUFFER_SIZE-byte view record per element (length, then inline
// bytes or prefix / buffer index / buffer offset).
241 buffer<uint8_t> length_buffer;
// Concatenated bytes of every value longer than SHORT_STRING_SIZE.
242 buffer<uint8_t> long_string_storage;
// Single int64 entry holding the byte size of long_string_storage.
243 u8_buffer<int64_t> buffer_sizes;
244 };
245
251 [[nodiscard]] static constexpr std::string_view get_arrow_format()
252 {
253 return std::is_same_v<T, arrow_traits<std::string>::value_type> ? std::string_view("vu")
254 : std::string_view("vz");
255 }
256
266 template <input_metadata_container METADATA_RANGE>
267 [[nodiscard]] static ArrowSchema create_arrow_schema(
268 std::optional<std::string_view> name,
269 std::optional<METADATA_RANGE> metadata,
270 std::optional<std::unordered_set<sparrow::ArrowFlag>> flags
271 )
272 {
273 constexpr repeat_view<bool> children_ownership(true, 0);
274 return make_arrow_schema(
275 get_arrow_format(),
276 std::move(name),
277 std::move(metadata),
278 flags,
279 nullptr, // children
280 children_ownership,
281 nullptr, // dictionary
282 true
283 );
284 }
285
304 template <std::ranges::input_range R>
305 requires std::convertible_to<std::ranges::range_value_t<R>, T>
306 static buffers_collection create_buffers(R&& range);
307
327 template <
328 std::ranges::input_range R,
329 validity_bitmap_input VB = validity_bitmap,
330 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
331 requires std::convertible_to<std::ranges::range_value_t<R>, T>
332 [[nodiscard]] static arrow_proxy create_proxy(
333 R&& range,
335 std::optional<std::string_view> name = std::nullopt,
336 std::optional<METADATA_RANGE> metadata = std::nullopt
337 );
338
356 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
357 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
358 [[nodiscard]] static arrow_proxy create_proxy(
359 NULLABLE_RANGE&& nullable_range,
360 std::optional<std::string_view> name = std::nullopt,
361 std::optional<METADATA_RANGE> metadata = std::nullopt
362 );
363
380 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
381 requires std::convertible_to<std::ranges::range_value_t<R>, T>
382 [[nodiscard]] static arrow_proxy create_proxy(
383 R&& range,
384 bool = true,
385 std::optional<std::string_view> name = std::nullopt,
386 std::optional<METADATA_RANGE> metadata = std::nullopt
387 );
388
408 template <
409 std::ranges::input_range VALUE_BUFFERS_RANGE,
410 validity_bitmap_input VB,
411 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
412 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
413 [[nodiscard]] static arrow_proxy create_proxy(
414 size_t element_count,
415 u8_buffer<uint8_t>&& buffer_view,
416 VALUE_BUFFERS_RANGE&& value_buffers,
417 VB&& validity_input,
418 std::optional<std::string_view> name = std::nullopt,
419 std::optional<METADATA_RANGE> metadata = std::nullopt
420 );
421
433 [[nodiscard]] constexpr inner_reference value(size_type i);
434
449 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
450
470 template <std::ranges::sized_range U>
471 requires mpl::convertible_ranges<U, T>
472 constexpr void assign(U&& rhs, size_type index);
473
474 // Modifiers
475
492 template <std::ranges::sized_range U>
493 requires mpl::convertible_ranges<U, T>
494 void resize_values(size_type new_length, U value);
495
514 template <std::ranges::sized_range U>
515 requires mpl::convertible_ranges<U, T>
516 value_iterator insert_value(const_value_iterator pos, U value, size_type count);
517
537 template <mpl::iterator_of_type<T> InputIt>
538 value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
539
556 value_iterator erase_values(const_value_iterator pos, size_type count);
557
565 [[nodiscard]] constexpr value_iterator value_begin();
566
574 [[nodiscard]] constexpr value_iterator value_end();
575
583 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
584
592 [[nodiscard]] constexpr const_value_iterator value_cend() const;
593
// Layout constants for the per-element view record and the array's buffer list.
//
// Position of the view buffer in the array's buffers (validity bitmap is first).
594 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
// Size in bytes of one view record.
595 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
// Longest value (in elements) stored inline inside the view record.
596 static constexpr std::size_t SHORT_STRING_SIZE = 12;
// Number of leading elements copied into the view record for long values.
597 static constexpr std::size_t PREFIX_SIZE = 4;
// Byte offset of the prefix (long values); follows the int32 length field.
598 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
// Byte offset of the inline data (short values); same position as the prefix.
599 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
// Byte offset of the int32 variadic-buffer index (long values).
600 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
// Byte offset of the int32 offset into that variadic buffer (long values).
601 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
// Added to the stored buffer index by value() to reach the data buffer.
602 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
603
604 friend base_type;
609 };
610
611 template <std::ranges::sized_range T, class CR, typename Ext>
616
617 namespace
618 {
619 // Utility functions for safe unaligned access to int32 values
/**
 * Reads a 32-bit signed integer from a possibly unaligned address.
 *
 * @param ptr Address of the first of sizeof(std::int32_t) readable bytes.
 * @return The integer whose object representation is stored at ptr.
 */
inline std::int32_t read_int32_unaligned(const std::uint8_t* ptr)
{
    std::int32_t result = 0;
    // memcpy rather than a pointer cast: ptr has no alignment guarantee.
    std::memcpy(&result, ptr, sizeof(result));
    return result;
}
626
/**
 * Stores a 32-bit signed integer at a possibly unaligned address.
 *
 * @param ptr   Address of the first of sizeof(std::int32_t) writable bytes.
 * @param value Integer whose object representation is written at ptr.
 */
inline void write_int32_unaligned(std::uint8_t* ptr, std::int32_t value)
{
    // memcpy rather than a pointer cast: ptr has no alignment guarantee.
    std::memcpy(ptr, &value, sizeof(value));
}
631
632 // Generic transformation function for type conversions
/**
 * Converts a value to another type with static_cast.
 *
 * Usable as the callable of a transform view once both template
 * arguments are given explicitly.
 *
 * @tparam To   Destination type.
 * @tparam From Source type.
 * @param v Value to convert.
 * @return static_cast<To>(v).
 */
template <typename To, typename From>
inline constexpr auto transform_to(const From& v) -> To
{
    return static_cast<To>(v);
}
638
639 // Helper function to update buffer offsets for long strings after a specific offset
/**
 * Shifts the stored buffer offset of every long value located strictly
 * after a threshold.
 *
 * Each view record starts with an int32 length. Records whose length is
 * at most short_string_size are left alone. For the others, the int32 at
 * buffer_offset_offset is adjusted when it is strictly greater than
 * threshold_offset.
 *
 * @param view_data            Start of the view records.
 * @param array_size           Number of view records.
 * @param data_buffer_size     Size in bytes of one view record.
 * @param short_string_size    Largest length still stored inline.
 * @param buffer_offset_offset Byte position of the offset field in a record.
 * @param threshold_offset     Only offsets strictly above this are adjusted.
 * @param offset_adjustment    Signed amount added to the selected offsets.
 * @param skip_index           Record index to leave untouched (default: none).
 */
template <typename SizeType>
inline void update_buffer_offsets_after(
    std::uint8_t* view_data,
    SizeType array_size,
    std::size_t data_buffer_size,
    std::size_t short_string_size,
    std::ptrdiff_t buffer_offset_offset,
    std::size_t threshold_offset,
    std::ptrdiff_t offset_adjustment,
    SizeType skip_index = static_cast<SizeType>(-1)
)
{
    const auto delta = static_cast<std::int32_t>(offset_adjustment);

    for (SizeType idx = 0; idx < array_size; ++idx)
    {
        if (idx == skip_index)
        {
            continue;
        }

        std::uint8_t* const record = view_data + (idx * data_buffer_size);

        std::int32_t stored_length = 0;
        std::memcpy(&stored_length, record, sizeof(stored_length));
        if (static_cast<std::size_t>(stored_length) <= short_string_size)
        {
            // Inline value: it has no offset field to patch.
            continue;
        }

        std::uint8_t* const offset_field = record + buffer_offset_offset;
        std::int32_t stored_offset = 0;
        std::memcpy(&stored_offset, offset_field, sizeof(stored_offset));
        if (static_cast<std::size_t>(stored_offset) <= threshold_offset)
        {
            continue;
        }

        stored_offset += delta;
        std::memcpy(offset_field, &stored_offset, sizeof(stored_offset));
    }
}
676
677 // Helper function to update buffer sizes metadata
/**
 * Overwrites the first int64 entry of the buffer-sizes buffer.
 *
 * @tparam Buffer Type exposing a `data<T>()` member template.
 * @param buffer_sizes_buffer Buffer whose first int64 slot is rewritten.
 * @param new_size            Value stored in that slot.
 */
template <typename Buffer>
inline void update_buffer_sizes_metadata(Buffer& buffer_sizes_buffer, std::int64_t new_size)
{
    // Only the first slot is touched; any further entries are left unchanged.
    *buffer_sizes_buffer.template data<std::int64_t>() = new_size;
}
684 }
685
686 template <std::ranges::sized_range T, class CR, typename Ext>
687 template <std::ranges::input_range R>
688 requires std::convertible_to<std::ranges::range_value_t<R>, T>
689 auto variable_size_binary_view_array_impl<T, CR, Ext>::create_buffers(R&& range) -> buffers_collection
690 {
691#ifdef __GNUC__
692# pragma GCC diagnostic push
693# pragma GCC diagnostic ignored "-Wcast-align"
694#endif
695
696 const auto size = range_size(range);
697 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE, typename buffer<uint8_t>::default_allocator());
698
699 std::size_t long_string_storage_size = 0;
700 std::size_t i = 0;
701 for (auto&& val : range)
702 {
703 auto val_casted = val
704 | std::ranges::views::transform(transform_to<std::uint8_t, typename T::value_type>);
705
706 const auto length = val.size();
707 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
708
709 // write length
710 write_int32_unaligned(length_ptr, static_cast<std::int32_t>(length));
711
712 if (length <= SHORT_STRING_SIZE)
713 {
714 // write data itself
715 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
716 std::fill(
717 length_ptr + SHORT_STRING_OFFSET + length,
718 length_ptr + DATA_BUFFER_SIZE,
719 std::uint8_t(0)
720 );
721 }
722 else
723 {
724 // write the prefix of the data
725 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
726 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
727
728 // write the buffer index
729 write_int32_unaligned(length_ptr + BUFFER_INDEX_OFFSET, 0);
730
731 // write the buffer offset
732 write_int32_unaligned(
733 length_ptr + BUFFER_OFFSET_OFFSET,
734 static_cast<std::int32_t>(long_string_storage_size)
735 );
736
737 // count the size of the long string storage
738 long_string_storage_size += length;
739 }
740 ++i;
741 }
742
743 // write the long string storage
744 buffer<uint8_t> long_string_storage(long_string_storage_size, buffer<uint8_t>::default_allocator());
745 std::size_t long_string_storage_offset = 0;
746 for (auto&& val : range)
747 {
748 const auto length = val.size();
749 if (length > SHORT_STRING_SIZE)
750 {
751 auto val_casted = val
752 | std::ranges::views::transform(
753 transform_to<std::uint8_t, typename T::value_type>
754 );
755 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
756 long_string_storage_offset += length;
757 }
758 }
759
760 // For binary or utf-8 view arrays, an extra buffer is appended which stores
761 // the lengths of each variadic data buffer as int64_t.
762 // This buffer is necessary since these buffer lengths are not trivially
763 // extractable from other data in an array of binary or utf-8 view type.
764 u8_buffer<int64_t> buffer_sizes(
765 static_cast<std::size_t>(1),
766 static_cast<int64_t>(long_string_storage_size)
767 );
768
769 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
770
771#ifdef __GNUC__
772# pragma GCC diagnostic pop
773#endif
774 }
775
776 template <std::ranges::sized_range T, class CR, typename Ext>
777 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
778 requires std::convertible_to<std::ranges::range_value_t<R>, T>
779 arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
780 R&& range,
781 VB&& validity_input,
782 std::optional<std::string_view> name,
783 std::optional<METADATA_RANGE> metadata
784 )
785 {
786 const auto size = range_size(range);
787 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
788 const auto null_count = vbitmap.null_count();
789
790 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
791
792 // create arrow schema
793 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
794
795 // create buffers
796 auto buffers_parts = create_buffers(std::forward<R>(range));
797
798 std::vector<buffer<uint8_t>> buffers{
799 std::move(vbitmap).extract_storage(),
800 std::move(buffers_parts.length_buffer),
801 std::move(buffers_parts.long_string_storage),
802 std::move(buffers_parts.buffer_sizes).extract_storage()
803 };
804
805 constexpr repeat_view<bool> children_ownership(true, 0);
806
807 // create arrow array
808 ArrowArray arr = make_arrow_array(
809 static_cast<std::int64_t>(size), // length
810 static_cast<int64_t>(null_count),
811 0, // offset
812 std::move(buffers),
813 nullptr, // children
815 nullptr, // dictionary
816 true
817 );
818
819 arrow_proxy proxy{std::move(arr), std::move(schema)};
820 Ext::init(proxy);
821 return proxy;
822 }
823
824 template <std::ranges::sized_range T, class CR, typename Ext>
825 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
826 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
827 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
828 NULLABLE_RANGE&& nullable_range,
829 std::optional<std::string_view> name,
830 std::optional<METADATA_RANGE> metadata
831 )
832 {
833 auto values = nullable_range
834 | std::views::transform(
835 [](const auto& v)
836 {
837 return static_cast<T>(v.value());
838 }
839 );
840
841 auto is_non_null = nullable_range
842 | std::views::transform(
843 [](const auto& v)
844 {
845 return v.has_value();
846 }
847 );
848
849 return create_proxy(
850 std::forward<decltype(values)>(values),
851 std::forward<decltype(is_non_null)>(is_non_null),
852 name,
853 metadata
854 );
855 }
856
// Creates an arrow_proxy from a range of values, optionally nullable.
//
// When `nullable` is true the call is delegated to another create_proxy
// overload. Otherwise the schema is built with no flags, the validity
// buffer is an empty buffer and the null count is 0.
857 template <std::ranges::sized_range T, class CR, typename Ext>
858 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
859 requires std::convertible_to<std::ranges::range_value_t<R>, T>
860 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
861 R&& range,
862 bool nullable,
863 std::optional<std::string_view> name,
864 std::optional<METADATA_RANGE> metadata
865 )
866 {
867 if (nullable)
868 {
// NOTE(review): the listing skips source line 871 here (the argument between
// `range` and `name`); presumably the validity input - confirm in the header.
869 return create_proxy(
870 std::forward<R>(range),
872 std::move(name),
873 std::move(metadata)
874 );
875 }
876
877 // create arrow schema
878 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), std::nullopt);
879
880 // create buffers
881 auto buffers_parts = create_buffers(std::forward<R>(range));
882
// Buffer order: validity, views, long-value storage, buffer sizes.
883 std::vector<buffer<uint8_t>> buffers{
884 buffer<uint8_t>{nullptr, 0, buffer<uint8_t>::default_allocator()}, // validity bitmap
885 std::move(buffers_parts.length_buffer),
886 std::move(buffers_parts.long_string_storage),
887 std::move(buffers_parts.buffer_sizes).extract_storage()
888 };
// NOTE(review): `range` is read here after being forwarded to create_buffers
// above; the other overload computes the size before forwarding.
889 const auto size = range_size(range);
890
891 constexpr repeat_view<bool> children_ownership(true, 0);
892
// NOTE(review): the listing skips source line 900 between the children and
// dictionary arguments; presumably `children_ownership,` - confirm.
893 // create arrow array
894 ArrowArray arr = make_arrow_array(
895 static_cast<std::int64_t>(size), // length
896 static_cast<int64_t>(0),
897 0, // offset
898 std::move(buffers),
899 nullptr, // children
901 nullptr, // dictionary
902 true
903 );
904
905 arrow_proxy proxy{std::move(arr), std::move(schema)};
906 Ext::init(proxy);
907 return proxy;
908 }
909
910 template <std::ranges::sized_range T, class CR, typename Ext>
911 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
912 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
913 arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
914 size_t element_count,
916 VALUE_BUFFERS_RANGE&& value_buffers,
917 VB&& validity_input,
918 std::optional<std::string_view> name,
919 std::optional<METADATA_RANGE> metadata
920 )
921 {
922 const auto size = buffer_view.size() / DATA_BUFFER_SIZE;
923 SPARROW_ASSERT_TRUE(size == element_count);
924
925 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
926
927 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
928
929 auto bitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
930 std::vector<buffer<uint8_t>> buffers{
931 std::move(bitmap).extract_storage(),
932 std::move(buffer_view).extract_storage()
933 };
934 for (auto&& buf : value_buffers)
935 {
936 buffers.emplace_back(std::forward<decltype(buf)>(buf), typename buffer<uint8_t>::default_allocator());
937 }
938
939 // Create buffer sizes for the variadic buffers
940 u8_buffer<int64_t> buffer_sizes(value_buffers.size());
941 for (std::size_t i = 0; i < value_buffers.size(); ++i)
942 {
943 buffer_sizes[i] = static_cast<int64_t>(value_buffers[i].size());
944 }
945 buffers.push_back(std::move(buffer_sizes).extract_storage());
946
947 constexpr repeat_view<bool> children_ownership(true, 0);
948
949 ArrowArray arr = make_arrow_array(
950 static_cast<std::int64_t>(size), // length
951 static_cast<std::int64_t>(bitmap.null_count()), // null_count
952 0, // offset
953 std::move(buffers),
954 nullptr, // children
956 nullptr, // dictionary
957 true
958 );
959
960 arrow_proxy proxy{std::move(arr), std::move(schema)};
961 Ext::init(proxy);
962 return proxy;
963 }
964
965 template <std::ranges::sized_range T, class CR, typename Ext>
966 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value(size_type i) -> inner_reference
967 {
968 return static_cast<const self_type*>(this)->value(i);
969 }
970
971 template <std::ranges::sized_range T, class CR, typename Ext>
972 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value(size_type i) const
973 -> inner_const_reference
974 {
975#ifdef __GNUC__
976# pragma GCC diagnostic push
977# pragma GCC diagnostic ignored "-Wcast-align"
978#endif
979
980 SPARROW_ASSERT_TRUE(i < this->size());
981 using char_or_byte = typename inner_const_reference::value_type;
982
983 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
984 + (i * DATA_BUFFER_SIZE);
985 const auto length = static_cast<std::size_t>(read_int32_unaligned(data_ptr));
986
987 if (length <= SHORT_STRING_SIZE)
988 {
989 const auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
990 const auto ret = inner_const_reference(ptr + SHORT_STRING_OFFSET, length);
991 return ret;
992 }
993 else
994 {
995 const auto buffer_index = static_cast<std::size_t>(
996 read_int32_unaligned(data_ptr + BUFFER_INDEX_OFFSET)
997 );
998 const auto buffer_offset = static_cast<std::size_t>(
999 read_int32_unaligned(data_ptr + BUFFER_OFFSET_OFFSET)
1000 );
1001 const auto buffer = this->get_arrow_proxy()
1002 .buffers()[buffer_index + FIRST_VAR_DATA_BUFFER_INDEX]
1003 .template data<const char_or_byte>();
1004 return inner_const_reference(buffer + buffer_offset, length);
1005 }
1006
1007#ifdef __GNUC__
1008# pragma GCC diagnostic pop
1009#endif
1010 }
1011
1012 template <std::ranges::sized_range T, class CR, typename Ext>
1013 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_begin() -> value_iterator
1014 {
1016 }
1017
1018 template <std::ranges::sized_range T, class CR, typename Ext>
1019 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_end() -> value_iterator
1020 {
1021 return value_iterator(detail::layout_value_functor<self_type, inner_reference>(this), this->size());
1022 }
1023
// Returns a const iterator to the first value.
// NOTE(review): the listing skips source line 1028 (the return statement);
// presumably it builds a const_value_iterator at index 0 - confirm against
// the original header.
1024 template <std::ranges::sized_range T, class CR, typename Ext>
1025 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_cbegin() const
1026 -> const_value_iterator
1027 {
1029 }
1030
// Returns a const iterator positioned at size(), i.e. one past the last value.
// NOTE(review): the listing skips source line 1035 (the first constructor
// argument, presumably the value functor) - confirm against the original header.
1031 template <std::ranges::sized_range T, class CR, typename Ext>
1032 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_cend() const -> const_value_iterator
1033 {
1034 return const_value_iterator(
1036 this->size()
1037 );
1038 }
1039
1040 template <std::ranges::sized_range T, class CR, typename Ext>
1041 template <std::ranges::sized_range U>
1043 constexpr void variable_size_binary_view_array_impl<T, CR, Ext>::assign(U&& rhs, size_type index)
1044 {
1045 SPARROW_ASSERT_TRUE(index < this->size());
1046 const auto new_length = static_cast<std::size_t>(std::ranges::size(rhs));
1047
1048 auto& length_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[LENGTH_BUFFER_INDEX];
1049 auto view_ptr = length_buffer.data() + (index * DATA_BUFFER_SIZE);
1050 const auto current_length = static_cast<std::size_t>(read_int32_unaligned(view_ptr));
1051
1052 // Update the length in the view structure
1053 write_int32_unaligned(view_ptr, static_cast<std::int32_t>(new_length));
1054
1055 if (new_length <= SHORT_STRING_SIZE)
1056 {
1057 auto data_ptr = view_ptr + SHORT_STRING_OFFSET;
1058 std::ranges::copy(rhs, reinterpret_cast<typename T::value_type*>(data_ptr));
1059
1060 // Clear any remaining bytes in the inline storage
1061 if (new_length < SHORT_STRING_SIZE)
1062 {
1063 std::fill_n(
1064 reinterpret_cast<typename T::value_type*>(data_ptr) + new_length,
1065 SHORT_STRING_SIZE - new_length,
1066 typename T::value_type{}
1067 );
1068 }
1069 }
1070 else
1071 {
1072 // Handle assignment of long strings (> 12 bytes)
1073 // This requires managing the variadic buffers and potentially reorganizing the layout
1074
1075 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1076 auto& var_data_buffer = buffers[FIRST_VAR_DATA_BUFFER_INDEX];
1077 auto& buffer_sizes_buffer = buffers[buffers.size() - 1]; // Last buffer contains sizes
1078
1079 const bool was_long_string = current_length > SHORT_STRING_SIZE;
1080 std::size_t current_buffer_offset = 0;
1081
1082 if (was_long_string)
1083 {
1084 current_buffer_offset = static_cast<std::size_t>(
1085 read_int32_unaligned(view_ptr + BUFFER_OFFSET_OFFSET)
1086 );
1087 }
1088
1089 auto transformed_data = rhs
1090 | std::ranges::views::transform(
1091 transform_to<typename T::value_type, typename T::value_type>
1092 );
1093
1094 // Check for memory reuse optimization: if the new value is identical to existing data
1095 bool can_reuse_memory = false;
1096 if (was_long_string && new_length == current_length)
1097 {
1098 const auto* existing_data = var_data_buffer.data() + current_buffer_offset;
1099 can_reuse_memory = std::ranges::equal(
1100 transformed_data,
1101 std::span<const typename T::value_type>(
1102 reinterpret_cast<const typename T::value_type*>(existing_data),
1103 new_length
1104 )
1105 );
1106 }
1107
1108 if (can_reuse_memory)
1109 {
1110 // Data is identical - just update the view structure prefix and we're done
1111 auto prefix_range = rhs | std::ranges::views::take(PREFIX_SIZE);
1112 auto prefix_transformed = prefix_range
1113 | std::ranges::views::transform(
1114 transform_to<std::uint8_t, typename T::value_type>
1115 );
1116 std::ranges::copy(prefix_transformed, view_ptr + PREFIX_OFFSET);
1117 return; // Early exit - no buffer management needed
1118 }
1119
1120 // Calculate space requirements and buffer management strategy
1121 const auto length_diff = static_cast<std::ptrdiff_t>(new_length)
1122 - static_cast<std::ptrdiff_t>(current_length);
1123 const bool can_fit_in_place = was_long_string && length_diff <= 0;
1124
1125 std::size_t final_offset = 0;
1126
1127 if (can_fit_in_place)
1128 {
1129 // We can reuse the existing space (new data is same size or smaller)
1130 final_offset = current_buffer_offset;
1131
1132 // If the new data is smaller, we need to compact the buffer
1133 if (length_diff < 0)
1134 {
1135 const auto bytes_to_compact = static_cast<std::size_t>(-length_diff);
1136 const auto move_start = current_buffer_offset + current_length;
1137 const auto move_end = var_data_buffer.size();
1138 const auto bytes_to_move = move_end - move_start;
1139
1140 if (bytes_to_move > 0)
1141 {
1142 std::move(
1143 var_data_buffer.data() + move_start,
1144 var_data_buffer.data() + move_end,
1145 var_data_buffer.data() + move_start - bytes_to_compact
1146 );
1147 }
1148
1149 var_data_buffer.resize(var_data_buffer.size() - bytes_to_compact);
1150
1151 // Update buffer offsets for all elements that come after this one
1152 update_buffer_offsets_after(
1153 length_buffer.data(),
1154 this->size(),
1155 DATA_BUFFER_SIZE,
1156 SHORT_STRING_SIZE,
1157 BUFFER_OFFSET_OFFSET,
1158 current_buffer_offset + current_length,
1159 -static_cast<std::ptrdiff_t>(bytes_to_compact),
1160 index
1161 );
1162
1163 // Update buffer sizes metadata
1164 update_buffer_sizes_metadata(
1165 buffer_sizes_buffer,
1166 static_cast<std::int64_t>(var_data_buffer.size())
1167 );
1168 }
1169 }
1170 else
1171 {
1172 // Need to expand buffer or assign to new location
1173 const auto expansion_needed = was_long_string ? length_diff
1174 : static_cast<std::ptrdiff_t>(new_length);
1175 const auto new_var_buffer_size = var_data_buffer.size() + expansion_needed;
1176
1177 if (was_long_string && length_diff > 0)
1178 {
1179 // Expand in-place: move data after current element to make space
1180 final_offset = current_buffer_offset;
1181 const auto expansion_bytes = static_cast<std::size_t>(length_diff);
1182 const auto move_start = current_buffer_offset + current_length;
1183 const auto bytes_to_move = var_data_buffer.size() - move_start;
1184
1185 // Resize buffer first
1186 var_data_buffer.resize(new_var_buffer_size);
1187
1188 if (bytes_to_move > 0)
1189 {
1190 // Move data to make space for expansion
1191 std::move_backward(
1192 var_data_buffer.data() + move_start,
1193 var_data_buffer.data() + move_start + bytes_to_move,
1194 var_data_buffer.data() + move_start + bytes_to_move + expansion_bytes
1195 );
1196 }
1197
1198 // Update buffer offsets for all elements that come after this one
1199 update_buffer_offsets_after(
1200 length_buffer.data(),
1201 this->size(),
1202 DATA_BUFFER_SIZE,
1203 SHORT_STRING_SIZE,
1204 BUFFER_OFFSET_OFFSET,
1205 move_start - 1, // threshold is just before move_start
1206 static_cast<std::ptrdiff_t>(expansion_bytes),
1207 index
1208 );
1209 }
1210 else
1211 {
1212 // Append to end of buffer (new long string)
1213 final_offset = var_data_buffer.size();
1214 var_data_buffer.resize(new_var_buffer_size);
1215 }
1216
1217 // Update buffer sizes metadata
1218 update_buffer_sizes_metadata(buffer_sizes_buffer, static_cast<std::int64_t>(new_var_buffer_size));
1219 }
1220
1221 std::ranges::copy(transformed_data, var_data_buffer.data() + final_offset);
1222
1223 // Update view structure for long string format
1224 // Write prefix (first 4 bytes)
1225 auto prefix_range = rhs | std::ranges::views::take(PREFIX_SIZE);
1226 auto prefix_transformed = prefix_range
1227 | std::ranges::views::transform(
1228 transform_to<std::uint8_t, typename T::value_type>
1229 );
1230 std::ranges::copy(prefix_transformed, view_ptr + PREFIX_OFFSET);
1231
1232 write_int32_unaligned(
1233 view_ptr + BUFFER_INDEX_OFFSET,
1234 static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX)
1235 );
1236
1237 write_int32_unaligned(view_ptr + BUFFER_OFFSET_OFFSET, static_cast<std::int32_t>(final_offset));
1238 }
1239 }
1240
1241 template <std::ranges::sized_range T, class CR, typename Ext>
1242 template <std::ranges::sized_range U>
1244 void variable_size_binary_view_array_impl<T, CR, Ext>::resize_values(size_type new_length, U value)
1245 {
1246 const size_t current_size = this->size();
1247
1248 if (new_length == current_size)
1249 {
1250 return;
1251 }
1252
1253 if (new_length < current_size)
1254 {
1255 erase_values(sparrow::next(value_cbegin(), new_length), current_size - new_length);
1256 }
1257 else
1258 {
1259 insert_value(value_cend(), value, new_length - current_size);
1260 }
1261 }
1262
1263 template <std::ranges::sized_range T, class CR, typename Ext>
1264 template <std::ranges::sized_range U>
1266 auto
1267 variable_size_binary_view_array_impl<T, CR, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1269 {
1270 const auto repeat_view = sparrow::repeat_view<U>(value, count);
1271 return insert_values(pos, std::ranges::begin(repeat_view), std::ranges::end(repeat_view));
1272 }
1273
1274 template <std::ranges::sized_range T, class CR, typename Ext>
1275 template <mpl::iterator_of_type<T> InputIt>
1276 auto variable_size_binary_view_array_impl<T, CR, Ext>::insert_values(
1278 InputIt first,
1279 InputIt last
1280 ) -> value_iterator
1281 {
1282 SPARROW_ASSERT_TRUE(first <= last);
1283 const size_type count = static_cast<size_type>(std::distance(first, last));
1284 if (count == 0)
1285 {
1286 const auto insert_index = std::distance(value_cbegin(), pos);
1287 return value_begin() + insert_index;
1288 }
1289
1290 const auto insert_index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1291 const auto current_size = this->size();
1292 const auto new_size = current_size + count;
1293
1294 // Calculate total additional variadic storage needed
1295 std::size_t additional_var_storage = 0;
1296 std::vector<std::size_t> value_lengths;
1297 value_lengths.reserve(count);
1298
1299 for (auto it = first; it != last; ++it)
1300 {
1301 const auto length = static_cast<std::size_t>(std::ranges::size(*it));
1302 value_lengths.push_back(length);
1303 if (length > SHORT_STRING_SIZE)
1304 {
1305 additional_var_storage += length;
1306 }
1307 }
1308
1309 auto& proxy = this->get_arrow_proxy();
1310 auto* private_data = proxy.get_array_private_data();
1311 auto& buffers = private_data->buffers();
1312
1313 const auto new_view_buffer_size = new_size * DATA_BUFFER_SIZE;
1314 buffers[LENGTH_BUFFER_INDEX].resize(new_view_buffer_size);
1315
1316 if (additional_var_storage > 0)
1317 {
1318 const auto current_var_size = buffers[FIRST_VAR_DATA_BUFFER_INDEX].size();
1319 buffers[FIRST_VAR_DATA_BUFFER_INDEX].resize(current_var_size + additional_var_storage);
1320 }
1321
1322 auto& buffer_sizes = buffers[buffers.size() - 1];
1323 update_buffer_sizes_metadata(
1324 buffer_sizes,
1325 static_cast<std::int64_t>(buffers[FIRST_VAR_DATA_BUFFER_INDEX].size())
1326 );
1327
1328 // Shift existing view structures after insertion point
1329 auto* view_data = buffers[LENGTH_BUFFER_INDEX].data();
1330 if (insert_index < current_size)
1331 {
1332 const auto bytes_to_move = (current_size - insert_index) * DATA_BUFFER_SIZE;
1333 const auto src_offset = insert_index * DATA_BUFFER_SIZE;
1334 const auto dst_offset = (insert_index + count) * DATA_BUFFER_SIZE;
1335
1336 std::memmove(view_data + dst_offset, view_data + src_offset, bytes_to_move);
1337
1338 // Update buffer offsets for moved long strings
1339 if (additional_var_storage > 0)
1340 {
1341 for (size_type i = insert_index + count; i < new_size; ++i)
1342 {
1343 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1344 std::int32_t length;
1345 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1346
1347 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1348 {
1349 std::int32_t current_offset;
1350 std::memcpy(&current_offset, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1351 current_offset += static_cast<std::int32_t>(additional_var_storage);
1352 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &current_offset, sizeof(std::int32_t));
1353 }
1354 }
1355 }
1356 }
1357
1358 // Insert new view structures
1359 std::size_t var_offset = buffers[FIRST_VAR_DATA_BUFFER_INDEX].size() - additional_var_storage;
1360 size_type value_idx = 0;
1361
1362 for (auto it = first; it != last; ++it, ++value_idx)
1363 {
1364 const auto view_index = insert_index + value_idx;
1365 auto* view_ptr = view_data + (view_index * DATA_BUFFER_SIZE);
1366 const auto value_length = value_lengths[value_idx];
1367
1368 const auto& current_value = *it;
1369
1370 // Write length
1371 const std::int32_t value_length_int32 = static_cast<std::int32_t>(value_length);
1372 std::memcpy(view_ptr, &value_length_int32, sizeof(std::int32_t));
1373
1374 if (value_length <= SHORT_STRING_SIZE)
1375 {
1376 // Store inline - convert and copy elements manually
1377 std::ranges::transform(
1378 current_value,
1379 view_ptr + SHORT_STRING_OFFSET,
1380 transform_to<std::uint8_t, typename T::value_type>
1381 );
1382
1383 std::fill(
1384 view_ptr + SHORT_STRING_OFFSET + value_length,
1385 view_ptr + DATA_BUFFER_SIZE,
1386 std::uint8_t(0)
1387 );
1388 }
1389 else
1390 {
1391 // Store prefix - copy first PREFIX_SIZE elements manually
1392 std::ranges::transform(
1393 current_value | std::views::take(PREFIX_SIZE),
1394 view_ptr + PREFIX_OFFSET,
1395 transform_to<std::uint8_t, typename T::value_type>
1396 );
1397
1398 // Set buffer index
1399 const std::int32_t buffer_index_zero = 0;
1400 std::memcpy(view_ptr + BUFFER_INDEX_OFFSET, &buffer_index_zero, sizeof(std::int32_t));
1401
1402 // Set buffer offset
1403 const std::int32_t var_offset_int32 = static_cast<std::int32_t>(var_offset);
1404 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &var_offset_int32, sizeof(std::int32_t));
1405
1406 // Copy data to variadic buffer - convert and copy manually
1407 std::ranges::transform(
1408 current_value,
1409 buffers[FIRST_VAR_DATA_BUFFER_INDEX].data() + var_offset,
1410 transform_to<std::uint8_t, typename T::value_type>
1411 );
1412
1413 var_offset += value_length;
1414 }
1415 }
1416
1417 // Update buffers
1418 proxy.update_buffers();
1419
1420 return value_begin() + static_cast<difference_type>(insert_index);
1421 }
1422
1423 template <std::ranges::sized_range T, class CR, typename Ext>
1424 auto
1425 variable_size_binary_view_array_impl<T, CR, Ext>::erase_values(const_value_iterator pos, size_type count)
1426 -> value_iterator
1427 {
1428 const size_t erase_index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1429 const size_t current_size = this->size();
1430
1431 // Validate bounds and handle zero count
1432 if (erase_index + count > current_size)
1433 {
1434 count = current_size - erase_index;
1435 }
1436
1437 if (count == 0)
1438 {
1439 return value_begin() + static_cast<difference_type>(erase_index);
1440 }
1441
1442 const auto new_size = current_size - count;
1443
1444 // Calculate how much variadic storage will be freed
1445 std::size_t freed_var_storage = 0;
1446 auto& proxy = this->get_arrow_proxy();
1447 auto* private_data = proxy.get_array_private_data();
1448 auto& buffers = private_data->buffers();
1449 auto* view_data = buffers[LENGTH_BUFFER_INDEX].data();
1450
1451 // Calculate freed storage from elements being erased
1452 for (size_type i = erase_index; i < erase_index + count; ++i)
1453 {
1454 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1455 std::int32_t length;
1456 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1457 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1458 {
1459 freed_var_storage += static_cast<std::size_t>(length);
1460 }
1461 }
1462
1463 // Handle empty array case
1464 if (new_size == 0)
1465 {
1466 // Resize all buffers to empty
1467 if (buffers[0].size() > 0)
1468 {
1469 buffers[0].clear();
1470 }
1471 buffers[LENGTH_BUFFER_INDEX].clear();
1472 buffers[FIRST_VAR_DATA_BUFFER_INDEX].clear();
1473
1474 auto& buffer_sizes = buffers[buffers.size() - 1];
1475 update_buffer_sizes_metadata(buffer_sizes, 0);
1476
1477 proxy.update_buffers();
1478 return value_begin();
1479 }
1480
1481 // Compact variadic buffer if needed
1482 if (freed_var_storage > 0)
1483 {
1484 auto& var_buffer = buffers[FIRST_VAR_DATA_BUFFER_INDEX];
1485 std::size_t write_offset = 0;
1486
1487 // Create mapping of old offsets to new offsets
1488 std::unordered_map<std::size_t, std::size_t> offset_mapping;
1489 offset_mapping.reserve(current_size - count);
1490
1491 for (size_type i = 0; i < current_size; ++i)
1492 {
1493 if (i >= erase_index && i < erase_index + count)
1494 {
1495 // Skip erased elements
1496 continue;
1497 }
1498
1499 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1500 std::int32_t length;
1501 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1502 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1503 {
1504 std::int32_t old_offset_int32;
1505 std::memcpy(&old_offset_int32, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1506 const auto old_offset = static_cast<std::size_t>(old_offset_int32);
1507
1508 // Record mapping for updating view structures later
1509 offset_mapping[old_offset] = write_offset;
1510
1511 // Move data if needed
1512 if (write_offset != old_offset)
1513 {
1514 std::memmove(
1515 var_buffer.data() + write_offset,
1516 var_buffer.data() + old_offset,
1517 static_cast<std::size_t>(length)
1518 );
1519 }
1520
1521 write_offset += static_cast<std::size_t>(length);
1522 }
1523 }
1524
1525 // Resize variadic buffer
1526 var_buffer.resize(var_buffer.size() - freed_var_storage);
1527
1528 // Update buffer sizes metadata
1529 auto& buffer_sizes = buffers[buffers.size() - 1];
1530 update_buffer_sizes_metadata(buffer_sizes, static_cast<std::int64_t>(var_buffer.size()));
1531
1532 // Update view structure offsets
1533 for (size_type i = 0; i < current_size; ++i)
1534 {
1535 if (i >= erase_index && i < erase_index + count)
1536 {
1537 continue; // Skip erased elements
1538 }
1539
1540 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1541 std::int32_t length;
1542 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1543 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1544 {
1545 std::int32_t old_offset_int32;
1546 std::memcpy(&old_offset_int32, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1547 const auto old_offset = static_cast<std::size_t>(old_offset_int32);
1548 auto it = offset_mapping.find(old_offset);
1549 if (it != offset_mapping.end())
1550 {
1551 const std::int32_t new_offset = static_cast<std::int32_t>(it->second);
1552 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &new_offset, sizeof(std::int32_t));
1553 }
1554 }
1555 }
1556 }
1557
1558 // Compact view buffer - move elements after erase range
1559 if (erase_index + count < current_size)
1560 {
1561 const auto src_offset = (erase_index + count) * DATA_BUFFER_SIZE;
1562 const auto dst_offset = erase_index * DATA_BUFFER_SIZE;
1563 const auto bytes_to_move = (current_size - erase_index - count) * DATA_BUFFER_SIZE;
1564
1565 std::memmove(view_data + dst_offset, view_data + src_offset, bytes_to_move);
1566 }
1567
1568 // Resize view buffer
1569 buffers[LENGTH_BUFFER_INDEX].resize(new_size * DATA_BUFFER_SIZE);
1570
1571 // Update buffers
1572 proxy.update_buffers();
1573
1574 // Return iterator to element after last erased, or end if we erased to the end
1575 return erase_index < new_size ? sparrow::next(value_begin(), erase_index) : value_end();
1576 }
1577
1578}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
constexpr size_type size() const noexcept
Object that owns a piece of contiguous memory.
Definition buffer.hpp:114
xsimd::aligned_allocator< T > default_allocator
Definition buffer.hpp:126
constexpr U * data() noexcept
Definition buffer.hpp:632
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
typename storage_type::default_allocator default_allocator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_view_array_impl(Args &&... args)
Generic constructor for creating variable-size binary view array.
variable_size_binary_view_array_impl(arrow_proxy)
Constructs variable-size binary view array from Arrow proxy.
Concept for convertible range types.
Definition mp_utils.hpp:931
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:120
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
SPARROW_API std::size_t array_size(const array_wrapper &ar)
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:35
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
Extensions to the C++ standard library.
functor_index_iterator< detail::layout_value_functor< const array_type, inner_const_reference > > const_value_iterator
functor_index_iterator< detail::layout_value_functor< array_type, inner_reference > > value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.