sparrow 1.3.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <cstring>
19#include <ranges>
20#include <unordered_map>
21
30#include "sparrow/u8_buffer.hpp"
39
40namespace sparrow
41{
42 template <std::ranges::sized_range T, class CR, typename Ext = empty_extension>
44
55
65 arrow_traits<std::vector<byte_t>>::const_reference>;
66
67 namespace detail
68 {
69 template <>
71 {
72 [[nodiscard]] static constexpr sparrow::data_type get()
73 {
75 }
76 };
77
78 template <>
80 {
81 [[nodiscard]] static constexpr sparrow::data_type get()
82 {
84 }
85 };
86 }
87
88 template <std::ranges::sized_range T, class CR, typename Ext>
101
102 template <class T>
104 {
105 };
106
107 template <std::ranges::sized_range T, class CR, typename Ext>
109 : std::true_type
110 {
111 };
112
116 template <class T>
118
158 template <std::ranges::sized_range T, class CR, typename Ext>
160 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T, CR, Ext>>,
161 public Ext
162 {
163 public:
164
167
169 using inner_value_type = typename inner_types::inner_value_type;
170 using inner_reference = typename inner_types::inner_reference;
171 using inner_const_reference = typename inner_types::inner_const_reference;
172
174 using bitmap_reference = typename base_type::bitmap_reference;
178 using bitmap_range = typename base_type::bitmap_range;
180
184
188
189 using value_iterator = typename base_type::value_iterator;
190 using const_value_iterator = typename base_type::const_value_iterator;
191
192 using iterator = typename base_type::iterator;
193 using const_iterator = typename base_type::const_iterator;
194
207
222 template <class... Args>
225 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
226 {
227 }
228
229 private:
230
239 struct buffers_collection
240 {
241 buffer<uint8_t> length_buffer;
242 buffer<uint8_t> long_string_storage;
243 u8_buffer<int64_t> buffer_sizes;
244 };
245
251 [[nodiscard]] static constexpr std::string_view get_arrow_format()
252 {
253 return std::is_same_v<T, arrow_traits<std::string>::value_type> ? std::string_view("vu")
254 : std::string_view("vz");
255 }
256
266 template <input_metadata_container METADATA_RANGE>
267 [[nodiscard]] static ArrowSchema create_arrow_schema(
268 std::optional<std::string_view> name,
269 std::optional<METADATA_RANGE> metadata,
270 std::optional<std::unordered_set<sparrow::ArrowFlag>> flags
271 )
272 {
273 constexpr repeat_view<bool> children_ownership(true, 0);
274 return make_arrow_schema(
275 get_arrow_format(),
276 std::move(name),
277 std::move(metadata),
278 flags,
279 nullptr, // children
280 children_ownership,
281 nullptr, // dictionary
282 true
283 );
284 }
285
304 template <std::ranges::input_range R>
305 requires std::convertible_to<std::ranges::range_value_t<R>, T>
306 static buffers_collection create_buffers(R&& range);
307
327 template <
328 std::ranges::input_range R,
329 validity_bitmap_input VB = validity_bitmap,
330 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
331 requires std::convertible_to<std::ranges::range_value_t<R>, T>
332 [[nodiscard]] static arrow_proxy create_proxy(
333 R&& range,
334 VB&& bitmap_input = validity_bitmap{},
335 std::optional<std::string_view> name = std::nullopt,
336 std::optional<METADATA_RANGE> metadata = std::nullopt
337 );
338
356 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
357 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
358 [[nodiscard]] static arrow_proxy create_proxy(
359 NULLABLE_RANGE&& nullable_range,
360 std::optional<std::string_view> name = std::nullopt,
361 std::optional<METADATA_RANGE> metadata = std::nullopt
362 );
363
380 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
381 requires std::convertible_to<std::ranges::range_value_t<R>, T>
382 [[nodiscard]] static arrow_proxy create_proxy(
383 R&& range,
384 bool = true,
385 std::optional<std::string_view> name = std::nullopt,
386 std::optional<METADATA_RANGE> metadata = std::nullopt
387 );
388
408 template <
409 std::ranges::input_range VALUE_BUFFERS_RANGE,
410 validity_bitmap_input VB,
411 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
412 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
413 [[nodiscard]] static arrow_proxy create_proxy(
414 size_t element_count,
415 u8_buffer<uint8_t>&& buffer_view,
416 VALUE_BUFFERS_RANGE&& value_buffers,
417 VB&& validity_input,
418 std::optional<std::string_view> name = std::nullopt,
419 std::optional<METADATA_RANGE> metadata = std::nullopt
420 );
421
433 [[nodiscard]] constexpr inner_reference value(size_type i);
434
449 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
450
470 template <std::ranges::sized_range U>
471 requires mpl::convertible_ranges<U, T>
472 constexpr void assign(U&& rhs, size_type index);
473
474 // Modifiers
475
492 template <std::ranges::sized_range U>
493 requires mpl::convertible_ranges<U, T>
494 void resize_values(size_type new_length, U value);
495
514 template <std::ranges::sized_range U>
515 requires mpl::convertible_ranges<U, T>
516 value_iterator insert_value(const_value_iterator pos, U value, size_type count);
517
537 template <mpl::iterator_of_type<T> InputIt>
538 value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
539
556 value_iterator erase_values(const_value_iterator pos, size_type count);
557
565 [[nodiscard]] constexpr value_iterator value_begin();
566
574 [[nodiscard]] constexpr value_iterator value_end();
575
583 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
584
592 [[nodiscard]] constexpr const_value_iterator value_cend() const;
593
594 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
595 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
596 static constexpr std::size_t SHORT_STRING_SIZE = 12;
597 static constexpr std::size_t PREFIX_SIZE = 4;
598 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
599 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
600 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
601 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
602 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
603
604 friend base_type;
609 };
610
611 template <std::ranges::sized_range T, class CR, typename Ext>
616
617 namespace
618 {
/**
 * Reads a 32-bit signed integer from a possibly unaligned address.
 *
 * The bytes are copied with std::memcpy, so the value is interpreted in the
 * host byte order and no alignment is required from @p ptr.
 *
 * @param ptr Address of the first of sizeof(std::int32_t) readable bytes.
 * @return The integer stored at @p ptr.
 */
inline std::int32_t read_int32_unaligned(const std::uint8_t* ptr)
{
    std::int32_t result = 0;
    std::memcpy(&result, ptr, sizeof(result));
    return result;
}
626
/**
 * Writes a 32-bit signed integer to a possibly unaligned address.
 *
 * The bytes are copied with std::memcpy, so the value is stored in the host
 * byte order and no alignment is required from @p ptr.
 *
 * @param ptr   Address of the first of sizeof(std::int32_t) writable bytes.
 * @param value Integer to store.
 */
inline void write_int32_unaligned(std::uint8_t* ptr, std::int32_t value)
{
    std::memcpy(ptr, &value, sizeof(value));
}
631
/**
 * Converts a value to another type with static_cast.
 *
 * Usable as the projection of a transform view or algorithm once both
 * template arguments are spelled out.
 *
 * @tparam To   Destination type.
 * @tparam From Source type.
 * @param v Value to convert.
 * @return static_cast<To>(v).
 */
template <typename To, typename From>
inline constexpr To transform_to(const From& v)
{
    return static_cast<To>(v);
}
638
/**
 * Shifts the stored buffer offset of long values located past a threshold.
 *
 * Every view entry whose length exceeds @p short_string_size and whose stored
 * offset is strictly greater than @p threshold_offset gets
 * @p offset_adjustment added to that offset. Entries of short values are left
 * untouched, as is the entry at @p skip_index.
 *
 * @tparam SizeType Integer type used to index the view entries.
 * @param view_data            Start of the contiguous view entries.
 * @param array_size           Number of view entries.
 * @param data_buffer_size     Size in bytes of one view entry.
 * @param short_string_size    Longest length stored inline in an entry.
 * @param buffer_offset_offset Byte offset of the int32 buffer offset inside an entry.
 * @param threshold_offset     Offsets strictly above this value are adjusted.
 * @param offset_adjustment    Signed amount added to the selected offsets.
 * @param skip_index           Entry to ignore; defaults to SizeType(-1).
 */
template <typename SizeType>
inline void update_buffer_offsets_after(
    std::uint8_t* view_data,
    SizeType array_size,
    std::size_t data_buffer_size,
    std::size_t short_string_size,
    std::ptrdiff_t buffer_offset_offset,
    std::size_t threshold_offset,
    std::ptrdiff_t offset_adjustment,
    SizeType skip_index = static_cast<SizeType>(-1)
)
{
    const auto delta = static_cast<std::int32_t>(offset_adjustment);

    for (SizeType i = 0; i < array_size; ++i)
    {
        if (i == skip_index)
        {
            continue;
        }

        std::uint8_t* const entry = view_data + (i * data_buffer_size);

        // memcpy keeps the accesses valid whatever the alignment of the entry.
        std::int32_t length = 0;
        std::memcpy(&length, entry, sizeof(length));
        if (static_cast<std::size_t>(length) <= short_string_size)
        {
            continue;  // inline value: the entry holds data, not an offset
        }

        std::uint8_t* const offset_field = entry + buffer_offset_offset;
        std::int32_t stored_offset = 0;
        std::memcpy(&stored_offset, offset_field, sizeof(stored_offset));
        if (static_cast<std::size_t>(stored_offset) <= threshold_offset)
        {
            continue;
        }

        stored_offset += delta;
        std::memcpy(offset_field, &stored_offset, sizeof(stored_offset));
    }
}
676
/**
 * Stores a new size in the first entry of the buffer-sizes buffer.
 *
 * Only the first int64 entry is written; any further entry is left as is.
 *
 * @tparam Buffer Type exposing `data<std::int64_t>()` returning a writable pointer.
 * @param buffer_sizes_buffer Buffer holding the int64 sizes.
 * @param new_size            Value written to the first entry.
 */
template <typename Buffer>
inline void update_buffer_sizes_metadata(Buffer& buffer_sizes_buffer, std::int64_t new_size)
{
    *buffer_sizes_buffer.template data<std::int64_t>() = new_size;
}
684 }
685
686 template <std::ranges::sized_range T, class CR, typename Ext>
687 template <std::ranges::input_range R>
688 requires std::convertible_to<std::ranges::range_value_t<R>, T>
689 auto variable_size_binary_view_array_impl<T, CR, Ext>::create_buffers(R&& range) -> buffers_collection
690 {
691#ifdef __GNUC__
692# pragma GCC diagnostic push
693# pragma GCC diagnostic ignored "-Wcast-align"
694#endif
695
696 const auto size = range_size(range);
697 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
698
699 std::size_t long_string_storage_size = 0;
700 std::size_t i = 0;
701 for (auto&& val : range)
702 {
703 auto val_casted = val
704 | std::ranges::views::transform(transform_to<std::uint8_t, typename T::value_type>);
705
706 const auto length = val.size();
707 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
708
709 // write length
710 write_int32_unaligned(length_ptr, static_cast<std::int32_t>(length));
711
712 if (length <= SHORT_STRING_SIZE)
713 {
714 // write data itself
715 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
716 std::fill(
717 length_ptr + SHORT_STRING_OFFSET + length,
718 length_ptr + DATA_BUFFER_SIZE,
719 std::uint8_t(0)
720 );
721 }
722 else
723 {
724 // write the prefix of the data
725 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
726 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
727
728 // write the buffer index
729 write_int32_unaligned(length_ptr + BUFFER_INDEX_OFFSET, 0);
730
731 // write the buffer offset
732 write_int32_unaligned(
733 length_ptr + BUFFER_OFFSET_OFFSET,
734 static_cast<std::int32_t>(long_string_storage_size)
735 );
736
737 // count the size of the long string storage
738 long_string_storage_size += length;
739 }
740 ++i;
741 }
742
743 // write the long string storage
744 buffer<uint8_t> long_string_storage(long_string_storage_size);
745 std::size_t long_string_storage_offset = 0;
746 for (auto&& val : range)
747 {
748 const auto length = val.size();
749 if (length > SHORT_STRING_SIZE)
750 {
751 auto val_casted = val
752 | std::ranges::views::transform(
753 transform_to<std::uint8_t, typename T::value_type>
754 );
755 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
756 long_string_storage_offset += length;
757 }
758 }
759
760 // For binary or utf-8 view arrays, an extra buffer is appended which stores
761 // the lengths of each variadic data buffer as int64_t.
762 // This buffer is necessary since these buffer lengths are not trivially
763 // extractable from other data in an array of binary or utf-8 view type.
764 u8_buffer<int64_t> buffer_sizes(
765 static_cast<std::size_t>(1),
766 static_cast<int64_t>(long_string_storage_size)
767 );
768
769 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
770
771#ifdef __GNUC__
772# pragma GCC diagnostic pop
773#endif
774 }
775
776 template <std::ranges::sized_range T, class CR, typename Ext>
777 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
778 requires std::convertible_to<std::ranges::range_value_t<R>, T>
779 arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
780 R&& range,
781 VB&& validity_input,
782 std::optional<std::string_view> name,
783 std::optional<METADATA_RANGE> metadata
784 )
785 {
786 const auto size = range_size(range);
787 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
788 const auto null_count = vbitmap.null_count();
789
790 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
791
792 // create arrow schema
793 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
794
795 // create buffers
796 auto buffers_parts = create_buffers(std::forward<R>(range));
797
798 std::vector<buffer<uint8_t>> buffers{
799 std::move(vbitmap).extract_storage(),
800 std::move(buffers_parts.length_buffer),
801 std::move(buffers_parts.long_string_storage),
802 std::move(buffers_parts.buffer_sizes).extract_storage()
803 };
804
805 constexpr repeat_view<bool> children_ownership(true, 0);
806
807 // create arrow array
808 ArrowArray arr = make_arrow_array(
809 static_cast<std::int64_t>(size), // length
810 static_cast<int64_t>(null_count),
811 0, // offset
812 std::move(buffers),
813 nullptr, // children
815 nullptr, // dictionary
816 true
817 );
818
819 arrow_proxy proxy{std::move(arr), std::move(schema)};
820 Ext::init(proxy);
821 return proxy;
822 }
823
824 template <std::ranges::sized_range T, class CR, typename Ext>
825 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
826 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
827 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
828 NULLABLE_RANGE&& nullable_range,
829 std::optional<std::string_view> name,
830 std::optional<METADATA_RANGE> metadata
831 )
832 {
833 auto values = nullable_range
834 | std::views::transform(
835 [](const auto& v)
836 {
837 return static_cast<T>(v.value());
838 }
839 );
840
841 auto is_non_null = nullable_range
842 | std::views::transform(
843 [](const auto& v)
844 {
845 return v.has_value();
846 }
847 );
848
849 return create_proxy(
850 std::forward<decltype(values)>(values),
851 std::forward<decltype(is_non_null)>(is_non_null),
852 name,
853 metadata
854 );
855 }
856
857 template <std::ranges::sized_range T, class CR, typename Ext>
858 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
859 requires std::convertible_to<std::ranges::range_value_t<R>, T>
860 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
861 R&& range,
862 bool nullable,
863 std::optional<std::string_view> name,
864 std::optional<METADATA_RANGE> metadata
865 )
866 {
867 if (nullable)
868 {
869 return create_proxy(std::forward<R>(range), validity_bitmap{}, std::move(name), std::move(metadata));
870 }
871
872 // create arrow schema
873 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), std::nullopt);
874
875 // create buffers
876 auto buffers_parts = create_buffers(std::forward<R>(range));
877
878 std::vector<buffer<uint8_t>> buffers{
879 buffer<uint8_t>{nullptr, 0}, // validity bitmap
880 std::move(buffers_parts.length_buffer),
881 std::move(buffers_parts.long_string_storage),
882 std::move(buffers_parts.buffer_sizes).extract_storage()
883 };
884 const auto size = range_size(range);
885
886 constexpr repeat_view<bool> children_ownership(true, 0);
887
888 // create arrow array
889 ArrowArray arr = make_arrow_array(
890 static_cast<std::int64_t>(size), // length
891 static_cast<int64_t>(0),
892 0, // offset
893 std::move(buffers),
894 nullptr, // children
896 nullptr, // dictionary
897 true
898 );
899
900 arrow_proxy proxy{std::move(arr), std::move(schema)};
901 Ext::init(proxy);
902 return proxy;
903 }
904
905 template <std::ranges::sized_range T, class CR, typename Ext>
906 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
907 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
908 arrow_proxy variable_size_binary_view_array_impl<T, CR, Ext>::create_proxy(
909 size_t element_count,
911 VALUE_BUFFERS_RANGE&& value_buffers,
912 VB&& validity_input,
913 std::optional<std::string_view> name,
914 std::optional<METADATA_RANGE> metadata
915 )
916 {
917 const auto size = buffer_view.size() / DATA_BUFFER_SIZE;
918 SPARROW_ASSERT_TRUE(size == element_count);
919
920 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
921
922 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
923
924 auto bitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
925 std::vector<buffer<uint8_t>> buffers{std::move(bitmap).extract_storage(), std::move(buffer_view)};
926 for (auto&& buf : value_buffers)
927 {
928 buffers.push_back(std::forward<decltype(buf)>(buf));
929 }
930
931 // Create buffer sizes for the variadic buffers
932 u8_buffer<int64_t> buffer_sizes(value_buffers.size());
933 for (std::size_t i = 0; i < value_buffers.size(); ++i)
934 {
935 buffer_sizes[i] = static_cast<int64_t>(value_buffers[i].size());
936 }
937 buffers.push_back(std::move(buffer_sizes).extract_storage());
938
939 constexpr repeat_view<bool> children_ownership(true, 0);
940
941 ArrowArray arr = make_arrow_array(
942 static_cast<std::int64_t>(size), // length
943 static_cast<std::int64_t>(bitmap.null_count()), // null_count
944 0, // offset
945 std::move(buffers),
946 nullptr, // children
948 nullptr, // dictionary
949 true
950 );
951
952 arrow_proxy proxy{std::move(arr), std::move(schema)};
953 Ext::init(proxy);
954 return proxy;
955 }
956
957 template <std::ranges::sized_range T, class CR, typename Ext>
958 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value(size_type i) -> inner_reference
959 {
960 return static_cast<const self_type*>(this)->value(i);
961 }
962
963 template <std::ranges::sized_range T, class CR, typename Ext>
964 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value(size_type i) const
965 -> inner_const_reference
966 {
967#ifdef __GNUC__
968# pragma GCC diagnostic push
969# pragma GCC diagnostic ignored "-Wcast-align"
970#endif
971
972 SPARROW_ASSERT_TRUE(i < this->size());
973 using char_or_byte = typename inner_const_reference::value_type;
974
975 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
976 + (i * DATA_BUFFER_SIZE);
977 const auto length = static_cast<std::size_t>(read_int32_unaligned(data_ptr));
978
979 if (length <= SHORT_STRING_SIZE)
980 {
981 const auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
982 const auto ret = inner_const_reference(ptr + SHORT_STRING_OFFSET, length);
983 return ret;
984 }
985 else
986 {
987 const auto buffer_index = static_cast<std::size_t>(
988 read_int32_unaligned(data_ptr + BUFFER_INDEX_OFFSET)
989 );
990 const auto buffer_offset = static_cast<std::size_t>(
991 read_int32_unaligned(data_ptr + BUFFER_OFFSET_OFFSET)
992 );
993 const auto buffer = this->get_arrow_proxy()
994 .buffers()[buffer_index + FIRST_VAR_DATA_BUFFER_INDEX]
995 .template data<const char_or_byte>();
996 return inner_const_reference(buffer + buffer_offset, length);
997 }
998
999#ifdef __GNUC__
1000# pragma GCC diagnostic pop
1001#endif
1002 }
1003
1004 template <std::ranges::sized_range T, class CR, typename Ext>
1005 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_begin() -> value_iterator
1006 {
1008 }
1009
1010 template <std::ranges::sized_range T, class CR, typename Ext>
1011 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_end() -> value_iterator
1012 {
1013 return value_iterator(detail::layout_value_functor<self_type, inner_reference>(this), this->size());
1014 }
1015
1016 template <std::ranges::sized_range T, class CR, typename Ext>
1017 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_cbegin() const
1018 -> const_value_iterator
1019 {
1021 }
1022
1023 template <std::ranges::sized_range T, class CR, typename Ext>
1024 constexpr auto variable_size_binary_view_array_impl<T, CR, Ext>::value_cend() const -> const_value_iterator
1025 {
1026 return const_value_iterator(
1028 this->size()
1029 );
1030 }
1031
1032 template <std::ranges::sized_range T, class CR, typename Ext>
1033 template <std::ranges::sized_range U>
1035 constexpr void variable_size_binary_view_array_impl<T, CR, Ext>::assign(U&& rhs, size_type index)
1036 {
1037 SPARROW_ASSERT_TRUE(index < this->size());
1038 const auto new_length = static_cast<std::size_t>(std::ranges::size(rhs));
1039
1040 auto& length_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[LENGTH_BUFFER_INDEX];
1041 auto view_ptr = length_buffer.data() + (index * DATA_BUFFER_SIZE);
1042 const auto current_length = static_cast<std::size_t>(read_int32_unaligned(view_ptr));
1043
1044 // Update the length in the view structure
1045 write_int32_unaligned(view_ptr, static_cast<std::int32_t>(new_length));
1046
1047 if (new_length <= SHORT_STRING_SIZE)
1048 {
1049 auto data_ptr = view_ptr + SHORT_STRING_OFFSET;
1050 std::ranges::copy(rhs, reinterpret_cast<typename T::value_type*>(data_ptr));
1051
1052 // Clear any remaining bytes in the inline storage
1053 if (new_length < SHORT_STRING_SIZE)
1054 {
1055 std::fill_n(
1056 reinterpret_cast<typename T::value_type*>(data_ptr) + new_length,
1057 SHORT_STRING_SIZE - new_length,
1058 typename T::value_type{}
1059 );
1060 }
1061 }
1062 else
1063 {
1064 // Handle assignment of long strings (> 12 bytes)
1065 // This requires managing the variadic buffers and potentially reorganizing the layout
1066
1067 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1068 auto& var_data_buffer = buffers[FIRST_VAR_DATA_BUFFER_INDEX];
1069 auto& buffer_sizes_buffer = buffers[buffers.size() - 1]; // Last buffer contains sizes
1070
1071 const bool was_long_string = current_length > SHORT_STRING_SIZE;
1072 std::size_t current_buffer_offset = 0;
1073
1074 if (was_long_string)
1075 {
1076 current_buffer_offset = static_cast<std::size_t>(
1077 read_int32_unaligned(view_ptr + BUFFER_OFFSET_OFFSET)
1078 );
1079 }
1080
1081 auto transformed_data = rhs
1082 | std::ranges::views::transform(
1083 transform_to<typename T::value_type, typename T::value_type>
1084 );
1085
1086 // Check for memory reuse optimization: if the new value is identical to existing data
1087 bool can_reuse_memory = false;
1088 if (was_long_string && new_length == current_length)
1089 {
1090 const auto* existing_data = var_data_buffer.data() + current_buffer_offset;
1091 can_reuse_memory = std::ranges::equal(
1092 transformed_data,
1093 std::span<const typename T::value_type>(
1094 reinterpret_cast<const typename T::value_type*>(existing_data),
1095 new_length
1096 )
1097 );
1098 }
1099
1100 if (can_reuse_memory)
1101 {
1102 // Data is identical - just update the view structure prefix and we're done
1103 auto prefix_range = rhs | std::ranges::views::take(PREFIX_SIZE);
1104 auto prefix_transformed = prefix_range
1105 | std::ranges::views::transform(
1106 transform_to<std::uint8_t, typename T::value_type>
1107 );
1108 std::ranges::copy(prefix_transformed, view_ptr + PREFIX_OFFSET);
1109 return; // Early exit - no buffer management needed
1110 }
1111
1112 // Calculate space requirements and buffer management strategy
1113 const auto length_diff = static_cast<std::ptrdiff_t>(new_length)
1114 - static_cast<std::ptrdiff_t>(current_length);
1115 const bool can_fit_in_place = was_long_string && length_diff <= 0;
1116
1117 std::size_t final_offset = 0;
1118
1119 if (can_fit_in_place)
1120 {
1121 // We can reuse the existing space (new data is same size or smaller)
1122 final_offset = current_buffer_offset;
1123
1124 // If the new data is smaller, we need to compact the buffer
1125 if (length_diff < 0)
1126 {
1127 const auto bytes_to_compact = static_cast<std::size_t>(-length_diff);
1128 const auto move_start = current_buffer_offset + current_length;
1129 const auto move_end = var_data_buffer.size();
1130 const auto bytes_to_move = move_end - move_start;
1131
1132 if (bytes_to_move > 0)
1133 {
1134 std::move(
1135 var_data_buffer.data() + move_start,
1136 var_data_buffer.data() + move_end,
1137 var_data_buffer.data() + move_start - bytes_to_compact
1138 );
1139 }
1140
1141 var_data_buffer.resize(var_data_buffer.size() - bytes_to_compact);
1142
1143 // Update buffer offsets for all elements that come after this one
1144 update_buffer_offsets_after(
1145 length_buffer.data(),
1146 this->size(),
1147 DATA_BUFFER_SIZE,
1148 SHORT_STRING_SIZE,
1149 BUFFER_OFFSET_OFFSET,
1150 current_buffer_offset + current_length,
1151 -static_cast<std::ptrdiff_t>(bytes_to_compact),
1152 index
1153 );
1154
1155 // Update buffer sizes metadata
1156 update_buffer_sizes_metadata(
1157 buffer_sizes_buffer,
1158 static_cast<std::int64_t>(var_data_buffer.size())
1159 );
1160 }
1161 }
1162 else
1163 {
1164 // Need to expand buffer or assign to new location
1165 const auto expansion_needed = was_long_string ? length_diff
1166 : static_cast<std::ptrdiff_t>(new_length);
1167 const auto new_var_buffer_size = var_data_buffer.size() + expansion_needed;
1168
1169 if (was_long_string && length_diff > 0)
1170 {
1171 // Expand in-place: move data after current element to make space
1172 final_offset = current_buffer_offset;
1173 const auto expansion_bytes = static_cast<std::size_t>(length_diff);
1174 const auto move_start = current_buffer_offset + current_length;
1175 const auto bytes_to_move = var_data_buffer.size() - move_start;
1176
1177 // Resize buffer first
1178 var_data_buffer.resize(new_var_buffer_size);
1179
1180 if (bytes_to_move > 0)
1181 {
1182 // Move data to make space for expansion
1183 std::move_backward(
1184 var_data_buffer.data() + move_start,
1185 var_data_buffer.data() + move_start + bytes_to_move,
1186 var_data_buffer.data() + move_start + bytes_to_move + expansion_bytes
1187 );
1188 }
1189
1190 // Update buffer offsets for all elements that come after this one
1191 update_buffer_offsets_after(
1192 length_buffer.data(),
1193 this->size(),
1194 DATA_BUFFER_SIZE,
1195 SHORT_STRING_SIZE,
1196 BUFFER_OFFSET_OFFSET,
1197 move_start - 1, // threshold is just before move_start
1198 static_cast<std::ptrdiff_t>(expansion_bytes),
1199 index
1200 );
1201 }
1202 else
1203 {
1204 // Append to end of buffer (new long string)
1205 final_offset = var_data_buffer.size();
1206 var_data_buffer.resize(new_var_buffer_size);
1207 }
1208
1209 // Update buffer sizes metadata
1210 update_buffer_sizes_metadata(buffer_sizes_buffer, static_cast<std::int64_t>(new_var_buffer_size));
1211 }
1212
1213 std::ranges::copy(transformed_data, var_data_buffer.data() + final_offset);
1214
1215 // Update view structure for long string format
1216 // Write prefix (first 4 bytes)
1217 auto prefix_range = rhs | std::ranges::views::take(PREFIX_SIZE);
1218 auto prefix_transformed = prefix_range
1219 | std::ranges::views::transform(
1220 transform_to<std::uint8_t, typename T::value_type>
1221 );
1222 std::ranges::copy(prefix_transformed, view_ptr + PREFIX_OFFSET);
1223
1224 write_int32_unaligned(
1225 view_ptr + BUFFER_INDEX_OFFSET,
1226 static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX)
1227 );
1228
1229 write_int32_unaligned(view_ptr + BUFFER_OFFSET_OFFSET, static_cast<std::int32_t>(final_offset));
1230 }
1231 }
1232
1233 template <std::ranges::sized_range T, class CR, typename Ext>
1234 template <std::ranges::sized_range U>
1236 void variable_size_binary_view_array_impl<T, CR, Ext>::resize_values(size_type new_length, U value)
1237 {
1238 const size_t current_size = this->size();
1239
1240 if (new_length == current_size)
1241 {
1242 return;
1243 }
1244
1245 if (new_length < current_size)
1246 {
1247 erase_values(sparrow::next(value_cbegin(), new_length), current_size - new_length);
1248 }
1249 else
1250 {
1251 insert_value(value_cend(), value, new_length - current_size);
1252 }
1253 }
1254
1255 template <std::ranges::sized_range T, class CR, typename Ext>
1256 template <std::ranges::sized_range U>
1258 auto
1259 variable_size_binary_view_array_impl<T, CR, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1261 {
1262 const auto repeat_view = sparrow::repeat_view<U>(value, count);
1263 return insert_values(pos, std::ranges::begin(repeat_view), std::ranges::end(repeat_view));
1264 }
1265
1266 template <std::ranges::sized_range T, class CR, typename Ext>
1267 template <mpl::iterator_of_type<T> InputIt>
1268 auto variable_size_binary_view_array_impl<T, CR, Ext>::insert_values(
1270 InputIt first,
1271 InputIt last
1272 ) -> value_iterator
1273 {
1274 SPARROW_ASSERT_TRUE(first <= last);
1275 const size_type count = static_cast<size_type>(std::distance(first, last));
1276 if (count == 0)
1277 {
1278 const auto insert_index = std::distance(value_cbegin(), pos);
1279 return value_begin() + insert_index;
1280 }
1281
1282 const auto insert_index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1283 const auto current_size = this->size();
1284 const auto new_size = current_size + count;
1285
1286 // Calculate total additional variadic storage needed
1287 std::size_t additional_var_storage = 0;
1288 std::vector<std::size_t> value_lengths;
1289 value_lengths.reserve(count);
1290
1291 for (auto it = first; it != last; ++it)
1292 {
1293 const auto length = static_cast<std::size_t>(std::ranges::size(*it));
1294 value_lengths.push_back(length);
1295 if (length > SHORT_STRING_SIZE)
1296 {
1297 additional_var_storage += length;
1298 }
1299 }
1300
1301 auto& proxy = this->get_arrow_proxy();
1302 auto* private_data = proxy.get_array_private_data();
1303 auto& buffers = private_data->buffers();
1304
1305 const auto new_view_buffer_size = new_size * DATA_BUFFER_SIZE;
1306 buffers[LENGTH_BUFFER_INDEX].resize(new_view_buffer_size);
1307
1308 if (additional_var_storage > 0)
1309 {
1310 const auto current_var_size = buffers[FIRST_VAR_DATA_BUFFER_INDEX].size();
1311 buffers[FIRST_VAR_DATA_BUFFER_INDEX].resize(current_var_size + additional_var_storage);
1312 }
1313
1314 auto& buffer_sizes = buffers[buffers.size() - 1];
1315 update_buffer_sizes_metadata(
1316 buffer_sizes,
1317 static_cast<std::int64_t>(buffers[FIRST_VAR_DATA_BUFFER_INDEX].size())
1318 );
1319
1320 // Shift existing view structures after insertion point
1321 auto* view_data = buffers[LENGTH_BUFFER_INDEX].data();
1322 if (insert_index < current_size)
1323 {
1324 const auto bytes_to_move = (current_size - insert_index) * DATA_BUFFER_SIZE;
1325 const auto src_offset = insert_index * DATA_BUFFER_SIZE;
1326 const auto dst_offset = (insert_index + count) * DATA_BUFFER_SIZE;
1327
1328 std::memmove(view_data + dst_offset, view_data + src_offset, bytes_to_move);
1329
1330 // Update buffer offsets for moved long strings
1331 if (additional_var_storage > 0)
1332 {
1333 for (size_type i = insert_index + count; i < new_size; ++i)
1334 {
1335 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1336 std::int32_t length;
1337 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1338
1339 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1340 {
1341 std::int32_t current_offset;
1342 std::memcpy(&current_offset, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1343 current_offset += static_cast<std::int32_t>(additional_var_storage);
1344 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &current_offset, sizeof(std::int32_t));
1345 }
1346 }
1347 }
1348 }
1349
1350 // Insert new view structures
1351 std::size_t var_offset = buffers[FIRST_VAR_DATA_BUFFER_INDEX].size() - additional_var_storage;
1352 size_type value_idx = 0;
1353
1354 for (auto it = first; it != last; ++it, ++value_idx)
1355 {
1356 const auto view_index = insert_index + value_idx;
1357 auto* view_ptr = view_data + (view_index * DATA_BUFFER_SIZE);
1358 const auto value_length = value_lengths[value_idx];
1359
1360 const auto& current_value = *it;
1361
1362 // Write length
1363 const std::int32_t value_length_int32 = static_cast<std::int32_t>(value_length);
1364 std::memcpy(view_ptr, &value_length_int32, sizeof(std::int32_t));
1365
1366 if (value_length <= SHORT_STRING_SIZE)
1367 {
1368 // Store inline - convert and copy elements manually
1369 std::ranges::transform(
1370 current_value,
1371 view_ptr + SHORT_STRING_OFFSET,
1372 transform_to<std::uint8_t, typename T::value_type>
1373 );
1374
1375 std::fill(
1376 view_ptr + SHORT_STRING_OFFSET + value_length,
1377 view_ptr + DATA_BUFFER_SIZE,
1378 std::uint8_t(0)
1379 );
1380 }
1381 else
1382 {
1383 // Store prefix - copy first PREFIX_SIZE elements manually
1384 std::ranges::transform(
1385 current_value | std::views::take(PREFIX_SIZE),
1386 view_ptr + PREFIX_OFFSET,
1387 transform_to<std::uint8_t, typename T::value_type>
1388 );
1389
1390 // Set buffer index
1391 const std::int32_t buffer_index_zero = 0;
1392 std::memcpy(view_ptr + BUFFER_INDEX_OFFSET, &buffer_index_zero, sizeof(std::int32_t));
1393
1394 // Set buffer offset
1395 const std::int32_t var_offset_int32 = static_cast<std::int32_t>(var_offset);
1396 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &var_offset_int32, sizeof(std::int32_t));
1397
1398 // Copy data to variadic buffer - convert and copy manually
1399 std::ranges::transform(
1400 current_value,
1401 buffers[FIRST_VAR_DATA_BUFFER_INDEX].data() + var_offset,
1402 transform_to<std::uint8_t, typename T::value_type>
1403 );
1404
1405 var_offset += value_length;
1406 }
1407 }
1408
1409 // Update buffers
1410 proxy.update_buffers();
1411
1412 return value_begin() + static_cast<difference_type>(insert_index);
1413 }
1414
1415 template <std::ranges::sized_range T, class CR, typename Ext>
1416 auto
1417 variable_size_binary_view_array_impl<T, CR, Ext>::erase_values(const_value_iterator pos, size_type count)
1418 -> value_iterator
1419 {
1420 const size_t erase_index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1421 const size_t current_size = this->size();
1422
1423 // Validate bounds and handle zero count
1424 if (erase_index + count > current_size)
1425 {
1426 count = current_size - erase_index;
1427 }
1428
1429 if (count == 0)
1430 {
1431 return value_begin() + static_cast<difference_type>(erase_index);
1432 }
1433
1434 const auto new_size = current_size - count;
1435
1436 // Calculate how much variadic storage will be freed
1437 std::size_t freed_var_storage = 0;
1438 auto& proxy = this->get_arrow_proxy();
1439 auto* private_data = proxy.get_array_private_data();
1440 auto& buffers = private_data->buffers();
1441 auto* view_data = buffers[LENGTH_BUFFER_INDEX].data();
1442
1443 // Calculate freed storage from elements being erased
1444 for (size_type i = erase_index; i < erase_index + count; ++i)
1445 {
1446 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1447 std::int32_t length;
1448 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1449 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1450 {
1451 freed_var_storage += static_cast<std::size_t>(length);
1452 }
1453 }
1454
1455 // Handle empty array case
1456 if (new_size == 0)
1457 {
1458 // Resize all buffers to empty
1459 if (buffers[0].size() > 0)
1460 {
1461 buffers[0].clear();
1462 }
1463 buffers[LENGTH_BUFFER_INDEX].clear();
1464 buffers[FIRST_VAR_DATA_BUFFER_INDEX].clear();
1465
1466 auto& buffer_sizes = buffers[buffers.size() - 1];
1467 update_buffer_sizes_metadata(buffer_sizes, 0);
1468
1469 proxy.update_buffers();
1470 return value_begin();
1471 }
1472
1473 // Compact variadic buffer if needed
1474 if (freed_var_storage > 0)
1475 {
1476 auto& var_buffer = buffers[FIRST_VAR_DATA_BUFFER_INDEX];
1477 std::size_t write_offset = 0;
1478
1479 // Create mapping of old offsets to new offsets
1480 std::unordered_map<std::size_t, std::size_t> offset_mapping;
1481 offset_mapping.reserve(current_size - count);
1482
1483 for (size_type i = 0; i < current_size; ++i)
1484 {
1485 if (i >= erase_index && i < erase_index + count)
1486 {
1487 // Skip erased elements
1488 continue;
1489 }
1490
1491 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1492 std::int32_t length;
1493 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1494 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1495 {
1496 std::int32_t old_offset_int32;
1497 std::memcpy(&old_offset_int32, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1498 const auto old_offset = static_cast<std::size_t>(old_offset_int32);
1499
1500 // Record mapping for updating view structures later
1501 offset_mapping[old_offset] = write_offset;
1502
1503 // Move data if needed
1504 if (write_offset != old_offset)
1505 {
1506 std::memmove(
1507 var_buffer.data() + write_offset,
1508 var_buffer.data() + old_offset,
1509 static_cast<std::size_t>(length)
1510 );
1511 }
1512
1513 write_offset += static_cast<std::size_t>(length);
1514 }
1515 }
1516
1517 // Resize variadic buffer
1518 var_buffer.resize(var_buffer.size() - freed_var_storage);
1519
1520 // Update buffer sizes metadata
1521 auto& buffer_sizes = buffers[buffers.size() - 1];
1522 update_buffer_sizes_metadata(buffer_sizes, static_cast<std::int64_t>(var_buffer.size()));
1523
1524 // Update view structure offsets
1525 for (size_type i = 0; i < current_size; ++i)
1526 {
1527 if (i >= erase_index && i < erase_index + count)
1528 {
1529 continue; // Skip erased elements
1530 }
1531
1532 auto* view_ptr = view_data + (i * DATA_BUFFER_SIZE);
1533 std::int32_t length;
1534 std::memcpy(&length, view_ptr, sizeof(std::int32_t));
1535 if (static_cast<std::size_t>(length) > SHORT_STRING_SIZE)
1536 {
1537 std::int32_t old_offset_int32;
1538 std::memcpy(&old_offset_int32, view_ptr + BUFFER_OFFSET_OFFSET, sizeof(std::int32_t));
1539 const auto old_offset = static_cast<std::size_t>(old_offset_int32);
1540 auto it = offset_mapping.find(old_offset);
1541 if (it != offset_mapping.end())
1542 {
1543 const std::int32_t new_offset = static_cast<std::int32_t>(it->second);
1544 std::memcpy(view_ptr + BUFFER_OFFSET_OFFSET, &new_offset, sizeof(std::int32_t));
1545 }
1546 }
1547 }
1548 }
1549
1550 // Compact view buffer - move elements after erase range
1551 if (erase_index + count < current_size)
1552 {
1553 const auto src_offset = (erase_index + count) * DATA_BUFFER_SIZE;
1554 const auto dst_offset = erase_index * DATA_BUFFER_SIZE;
1555 const auto bytes_to_move = (current_size - erase_index - count) * DATA_BUFFER_SIZE;
1556
1557 std::memmove(view_data + dst_offset, view_data + src_offset, bytes_to_move);
1558 }
1559
1560 // Resize view buffer
1561 buffers[LENGTH_BUFFER_INDEX].resize(new_size * DATA_BUFFER_SIZE);
1562
1563 // Update buffers
1564 proxy.update_buffers();
1565
1566 // Return iterator to element after last erased, or end if we erased to the end
1567 return erase_index < new_size ? sparrow::next(value_begin(), erase_index) : value_end();
1568 }
1569
1570}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
constexpr size_type size() const noexcept
Object that owns a piece of contiguous memory.
Definition buffer.hpp:113
constexpr U * data() noexcept
Definition buffer.hpp:630
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_view_array_impl(Args &&... args)
Generic constructor for creating variable-size binary view array.
variable_size_binary_view_array_impl(arrow_proxy)
Constructs variable-size binary view array from Arrow proxy.
Concept for convertible range types.
Definition mp_utils.hpp:931
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:117
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
SPARROW_API std::size_t array_size(const array_wrapper &ar)
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
functor_index_iterator< detail::layout_value_functor< const array_type, inner_const_reference > > const_value_iterator
functor_index_iterator< detail::layout_value_functor< array_type, inner_reference > > value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.