sparrow 2.0.0
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <limits>
20#include <numeric>
21#include <optional>
22#include <ranges>
23#include <stdexcept>
24#include <string>
25#include <vector>
26
39
40namespace sparrow
41{
42 namespace detail
43 {
44 template <class T, class OT>
46
47 template <>
48 struct variable_size_binary_format<std::string, std::int32_t>
49 {
50 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
51 {
52 return "u";
53 }
54 };
55
56 template <>
57 struct variable_size_binary_format<std::string, std::int64_t>
58 {
59 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
60 {
61 return "U";
62 }
63 };
64
65 template <>
66 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
67 {
68 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
69 {
70 return "z";
71 }
72 };
73
74 template <>
75 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
76 {
77 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
78 {
79 return "Z";
80 }
81 };
82 }
83
84
// Forward declaration of the variable-size binary layout implementation.
// T is a sized range type, CR is a second type parameter (the array aliases
// below pass arrow_traits<...>::const_reference for it), OT is the offset
// type, and Ext is an extension type that defaults to empty_extension.
85 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext = empty_extension>
86 class variable_size_binary_array_impl;
87
88 template <layout_offset OT, typename Ext = empty_extension>
92 OT,
93 Ext>;
94
95 template <layout_offset OT, typename Ext = empty_extension>
98 arrow_traits<std::vector<byte_t>>::const_reference,
99 OT,
100 Ext>;
101
115
129
143
157
158 namespace detail
159 {
160 template <>
162 {
163 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
164 {
166 }
167 };
168
169 template <>
171 {
172 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
173 {
175 }
176 };
177
178 template <>
180 {
181 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
182 {
184 }
185 };
186
187 template <>
189 {
190 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
191 {
193 }
194 };
195 }
196
200 template <class T>
201 constexpr bool is_string_array_v = std::same_as<T, string_array>;
202
206 template <class T>
207 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
208
212 template <class T>
213 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
214
218 template <class T>
219 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
220
221 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
223 {
225
229 using offset_type = OT;
230
231 using data_value_type = typename T::value_type;
232
233 using offset_iterator = OT*;
234 using const_offset_iterator = const OT*;
235
238
239 using iterator_tag = std::random_access_iterator_tag;
240
242
251
253
262
264
265 // using iterator = layout_iterator<array_type, false>;
266 // using const_iterator = layout_iterator<array_type, true, CR>;
267 };
268
277 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
279 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT, Ext>>,
280 public Ext
281 {
282 private:
283
284 static_assert(
285 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
286 "Only sequences of types with the same size as uint8_t are supported"
287 );
288
289 public:
290
293
295 using inner_value_type = typename inner_types::inner_value_type;
296 using inner_reference = typename inner_types::inner_reference;
297 using inner_const_reference = typename inner_types::inner_const_reference;
298
299 using offset_type = typename inner_types::offset_type;
304
306 using bitmap_reference = typename base_type::bitmap_reference;
309
313
314 using offset_iterator = typename inner_types::offset_iterator;
315 using const_offset_iterator = typename inner_types::const_offset_iterator;
316
320 using data_iterator = typename inner_types::data_iterator;
321
322 using const_data_iterator = typename inner_types::const_data_iterator;
323 using data_value_type = typename inner_types::data_value_type;
324
325 using value_iterator = typename inner_types::value_iterator;
326 using const_value_iterator = typename inner_types::const_value_iterator;
327
345
359 template <class... ARGS>
362 : self_type(create_proxy(std::forward<ARGS>(args)...))
363 {
364 }
365
366 using base_type::get_arrow_proxy;
367 using base_type::size;
368
381 [[nodiscard]] constexpr inner_reference value(size_type i);
382
398 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
399
416 template <std::ranges::range SIZES_RANGE>
417 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
418
419 private:
420
441 template <
444 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
445 [[nodiscard]] static arrow_proxy create_proxy(
446 u8_buffer<C>&& data_buffer,
447 offset_buffer_type&& list_offsets,
449 std::optional<std::string_view> name = std::nullopt,
450 std::optional<METADATA_RANGE> metadata = std::nullopt
451 );
452
471 template <
472 std::ranges::input_range R,
474 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
475 requires(
476 std::ranges::input_range<std::ranges::range_value_t<R>>
478 )
479 [[nodiscard]] static arrow_proxy create_proxy(
480 R&& values,
482 std::optional<std::string_view> name = std::nullopt,
483 std::optional<METADATA_RANGE> metadata = std::nullopt
484 );
485
503 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
504 requires(
505 std::ranges::input_range<std::ranges::range_value_t<R>>
506 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
507 )
508 [[nodiscard]] static arrow_proxy create_proxy(
509 R&& values,
510 bool nullable,
511 std::optional<std::string_view> name = std::nullopt,
512 std::optional<METADATA_RANGE> metadata = std::nullopt
513 );
514
529 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
530 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
531 [[nodiscard]] static arrow_proxy create_proxy(
532 R&&,
533 std::optional<std::string_view> name = std::nullopt,
534 std::optional<METADATA_RANGE> metadata = std::nullopt
535 );
536
554 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
555 [[nodiscard]] static arrow_proxy create_proxy_impl(
556 u8_buffer<C>&& data_buffer,
557 offset_buffer_type&& list_offsets,
558 std::optional<validity_bitmap>&&,
559 std::optional<std::string_view> name = std::nullopt,
560 std::optional<METADATA_RANGE> metadata = std::nullopt
561 );
562
563 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
564 static constexpr size_t DATA_BUFFER_INDEX = 2;
565
577 [[nodiscard]] constexpr offset_iterator offset(size_type i);
578
586 [[nodiscard]] constexpr offset_iterator offsets_begin();
587
595 [[nodiscard]] constexpr offset_iterator offsets_end();
596
608 [[nodiscard]] constexpr data_iterator data(size_type i);
609
617 [[nodiscard]] constexpr value_iterator value_begin();
618
626 [[nodiscard]] constexpr value_iterator value_end();
627
635 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
636
644 [[nodiscard]] constexpr const_value_iterator value_cend() const;
645
657 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
658
666 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
667
675 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
676
688 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
689
690 // Modifiers
691
704 template <std::ranges::sized_range U>
705 requires mpl::convertible_ranges<U, T>
706 constexpr void resize_values(size_type new_length, U value);
707
717 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
718
734 template <std::ranges::sized_range U>
735 requires mpl::convertible_ranges<U, T>
736 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
737
750 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
751
768 template <mpl::iterator_of_type<T> InputIt>
769 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
770
791 template <mpl::iterator_of_type<OT> InputIt>
792 constexpr offset_iterator
793 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
794
812 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
813
830 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
831
847 template <std::ranges::sized_range U>
848 requires mpl::convertible_ranges<U, T>
849 constexpr void assign(U&& rhs, size_type index);
850
860 constexpr void check_offset_overflow(offset_type current_offset, offset_type size_to_add) const;
861
864 friend base_type;
867 };
868
869 /*********************************************
870 * variable_size_binary_array_impl implementation *
871 *********************************************/
872
873 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
875 : base_type(std::move(proxy))
876 {
877 const auto type = this->get_arrow_proxy().data_type();
880 || type == data_type::LARGE_BINARY
881 );
883 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
884 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
885 && std::same_as<OT, int64_t>) )
886 );
887 }
888
889 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
890 template <std::ranges::range SIZES_RANGE>
893 {
895 std::forward<SIZES_RANGE>(sizes)
896 );
897 }
898
899 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
900 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
901 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
902 u8_buffer<C>&& data_buffer,
903 offset_buffer_type&& offsets,
904 VB&& validity_input,
905 std::optional<std::string_view> name,
906 std::optional<METADATA_RANGE> metadata
907 )
908 {
909 const auto size = offsets.size() - 1;
910 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
911 const auto null_count = vbitmap.null_count();
912
915 std::move(name), // name
916 std::move(metadata), // metadata
917 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
918 nullptr, // children
919 repeat_view<bool>(true, 0),
920 nullptr, // dictionary
921 true
922
923 );
924 std::vector<buffer<std::uint8_t>> arr_buffs = {
925 std::move(vbitmap).extract_storage(),
926 std::move(offsets).extract_storage(),
927 std::move(data_buffer).extract_storage()
928 };
929
930 ArrowArray arr = make_arrow_array(
931 static_cast<std::int64_t>(size), // length
932 static_cast<int64_t>(null_count),
933 0, // offset
934 std::move(arr_buffs),
935 nullptr, // children
936 repeat_view<bool>(true, 0),
937 nullptr, // dictionary
938 true
939 );
940 return arrow_proxy{std::move(arr), std::move(schema)};
941 }
942
943 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
944 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
945 requires(
946 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
948 // range of char-like
949 )
950 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
951 R&& values,
952 VB&& validity_input,
953 std::optional<std::string_view> name,
954 std::optional<METADATA_RANGE> metadata
955 )
956 {
957 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
958
959 auto size_range = values
960 | std::views::transform(
961 [](const auto& v)
962 {
963 return std::ranges::size(v);
964 }
965 );
966 auto offset_buffer = offset_from_sizes(size_range);
967 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
968 return create_proxy(
969 std::move(data_buffer),
970 std::move(offset_buffer),
971 std::forward<VB>(validity_input),
972 std::forward<std::optional<std::string_view>>(name),
973 std::forward<std::optional<METADATA_RANGE>>(metadata)
974 );
975 }
976
977 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
978 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
979 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
980 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
981 R&& range,
982 std::optional<std::string_view> name,
983 std::optional<METADATA_RANGE> metadata
984 )
985 {
986 // split into values and is_non_null ranges
987 const auto values = range
988 | std::views::transform(
989 [](const auto& v)
990 {
991 return v.get();
992 }
993 );
994 const auto is_non_null = range
995 | std::views::transform(
996 [](const auto& v)
997 {
998 return v.has_value();
999 }
1000 );
1001 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
1002 }
1003
1004 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1005 template <
1006 std::ranges::input_range R,
1007 input_metadata_container METADATA_RANGE>
1008 requires(
1009 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1011 // range of
1012 // char-like
1013 )
1014 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1015 R&& values,
1016 bool nullable,
1017 std::optional<std::string_view> name,
1018 std::optional<METADATA_RANGE> metadata
1019 )
1020 {
1021 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1022 const size_t size = std::ranges::size(values);
1023 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1024 auto size_range = values
1025 | std::views::transform(
1026 [](const auto& v)
1027 {
1028 return std::ranges::size(v);
1029 }
1030 );
1031 auto offset_buffer = offset_from_sizes(size_range);
1032 return create_proxy_impl(
1033 std::move(data_buffer),
1034 std::move(offset_buffer),
1035 nullable ? std::make_optional<validity_bitmap>(nullptr, size, validity_bitmap::default_allocator())
1036 : std::nullopt,
1037 std::move(name),
1038 std::move(metadata)
1039 );
1040 }
1041
1042 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1043 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1044 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy_impl(
1045 u8_buffer<C>&& data_buffer,
1046 offset_buffer_type&& list_offsets,
1047 std::optional<validity_bitmap>&& bitmap,
1048 std::optional<std::string_view> name,
1049 std::optional<METADATA_RANGE> metadata
1050 )
1051 {
1052 const auto size = list_offsets.size() - 1;
1053 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1054
1055 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1056 flags = bitmap.has_value()
1057 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1058 : std::nullopt;
1059
1060 ArrowSchema schema = make_arrow_schema(
1062 std::move(name), // name
1063 std::move(metadata), // metadata
1064 flags, // flags,
1065 nullptr, // children
1066 repeat_view<bool>(true, 0),
1067 nullptr, // dictionary
1068 true
1069
1070 );
1071 std::vector<buffer<std::uint8_t>> arr_buffs = {
1072 bitmap.has_value() ? std::move(*bitmap).extract_storage()
1074 std::move(list_offsets).extract_storage(),
1075 std::move(data_buffer).extract_storage()
1076 };
1077
1078 ArrowArray arr = make_arrow_array(
1079 static_cast<std::int64_t>(size), // length
1080 static_cast<int64_t>(null_count),
1081 0, // offset
1082 std::move(arr_buffs),
1083 nullptr, // children
1084 repeat_view<bool>(true, 0),
1085 nullptr, // dictionary
1086 true
1087 );
1088 arrow_proxy proxy{std::move(arr), std::move(schema)};
1089 Ext::init(proxy);
1090 return proxy;
1091 }
1092
1093 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1094 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) -> data_iterator
1095 {
1096 arrow_proxy& proxy = get_arrow_proxy();
1097 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1098 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1099 }
1100
1101 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1102 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) const
1103 -> const_data_iterator
1104 {
1105 const arrow_proxy& proxy = this->get_arrow_proxy();
1106 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1107 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1108 }
1109
1110 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1111 template <std::ranges::sized_range U>
1113 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::assign(U&& rhs, size_type index)
1114 {
1115 SPARROW_ASSERT_TRUE(index < size());
1116 const auto offset_beg = *offset(index);
1117 const auto offset_end = *offset(index + 1);
1118 const auto initial_value_length = offset_end - offset_beg;
1119 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1120 const OT shift_byte_count = new_value_length - initial_value_length;
1121 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1122 if (shift_byte_count != 0)
1123 {
1124 // Check for offset overflow before adjusting
1125 if (shift_byte_count > 0)
1126 {
1127 const offset_type last_offset = *offset(size());
1128 check_offset_overflow(last_offset, shift_byte_count);
1129 }
1130
1131 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1132 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1133 : data_buffer.size() + shift_val_abs;
1134
1135 if (shift_byte_count > 0)
1136 {
1137 data_buffer.resize(new_data_buffer_size);
1138 // Move elements to make space for the new value
1139 std::move_backward(
1140 sparrow::next(data_buffer.begin(), offset_end),
1141 sparrow::next(data_buffer.end(), -shift_byte_count),
1142 data_buffer.end()
1143 );
1144 }
1145 else
1146 {
1147 std::move(
1148 sparrow::next(data_buffer.begin(), offset_end),
1149 data_buffer.end(),
1150 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
1151 );
1152 data_buffer.resize(new_data_buffer_size);
1153 }
1154 // Adjust offsets for subsequent elements
1155 std::for_each(
1156 offset(index + 1),
1157 offset(size() + 1),
1158 [shift_byte_count](auto& offset)
1159 {
1160 offset += shift_byte_count;
1161 }
1162 );
1163 }
1164 auto tmp = std::views::transform(
1165 rhs,
1166 [](const auto& val)
1167 {
1168 return static_cast<std::uint8_t>(val);
1169 }
1170 );
1171 // Copy the new value into the buffer
1172 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
1173 }
1174
1175 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1176 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::check_offset_overflow(
1177 offset_type current_offset,
1178 offset_type size_to_add
1179 ) const
1180 {
1181 constexpr offset_type max_offset = std::numeric_limits<offset_type>::max();
1182 if (current_offset > max_offset - size_to_add)
1183 {
1184 throw std::overflow_error("Offset overflow: adding elements would exceed maximum offset value");
1185 }
1186 }
1187
1188 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1189 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) -> offset_iterator
1190 {
1191 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1192 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1193 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1194 }
1195
1196 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1197 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) const
1198 -> const_offset_iterator
1199 {
1200 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1201 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1202 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1203 }
1204
1205 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1206 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_begin() -> offset_iterator
1207 {
1208 return offset(0);
1209 }
1210
1211 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1212 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cbegin() const
1213 -> const_offset_iterator
1214 {
1215 return offset(0);
1216 }
1217
1218 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1219 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_end() -> offset_iterator
1220 {
1221 return offset(size() + 1);
1222 }
1223
1224 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1225 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cend() const
1226 -> const_offset_iterator
1227 {
1228 return offset(size() + 1);
1229 }
1230
1231 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1233 {
1234 SPARROW_ASSERT_TRUE(i < size());
1235 return inner_reference(this, i);
1236 }
1237
1238 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1241 {
1242 SPARROW_ASSERT_TRUE(i < this->size());
1243 const OT offset_begin = *offset(i);
1244 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1245 const OT offset_end = *offset(i + 1);
1246 SPARROW_ASSERT_TRUE(offset_end >= 0);
1247 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1248 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1249 return inner_const_reference(pointer_begin, pointer_end);
1250 }
1251
1252 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1253 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_begin() -> value_iterator
1254 {
1255 return value_iterator{this, 0};
1256 }
1257
1258 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1259 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_end() -> value_iterator
1260 {
1261 return sparrow::next(value_begin(), size());
1262 }
1263
1264 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1265 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cbegin() const -> const_value_iterator
1266 {
1267 return const_value_iterator{this, 0};
1268 }
1269
1270 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1271 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cend() const -> const_value_iterator
1272 {
1273 return sparrow::next(value_cbegin(), this->size());
1274 }
1275
1276 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1277 template <std::ranges::sized_range U>
1279 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::resize_values(size_type new_length, U value)
1280 {
1281 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1282 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1283 if (new_length < size())
1284 {
1285 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1286 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1287 data_buffer.resize(offset_begin);
1288 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1289 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1290 offset_buffer_adaptor.resize(new_size + 1);
1291 }
1292 else if (new_length > size())
1293 {
1294 insert_value(value_cend(), value, new_length - size());
1295 }
1296 }
1297
1298 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1299 template <std::ranges::sized_range U>
1301 constexpr auto
1302 variable_size_binary_array_impl<T, CR, OT, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1304 {
1305 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1306 const OT offset_begin = *offset(idx);
1307 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
1308 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
1309 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1310 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1311 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
1312 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1313 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1314 return sparrow::next(value_begin(), idx);
1315 }
1316
1317 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1318 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offset(
1319 const_offset_iterator pos,
1320 offset_type value_size,
1321 size_type count
1322 ) -> offset_iterator
1323 {
1324 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1325 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1326 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1327 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1328
1329 // Check for offset overflow before adjusting
1330 if (!offset_buffer_adaptor.empty())
1331 {
1332 const offset_type last_offset = offset_buffer_adaptor.back();
1333 check_offset_overflow(last_offset, cumulative_size);
1334 }
1335
1336 // Adjust offsets for subsequent elements
1337 std::for_each(
1338 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1339 offset_buffer_adaptor.end(),
1340 [cumulative_size](auto& offset)
1341 {
1342 offset += cumulative_size;
1343 }
1344 );
1345 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1346 // Put the right values in the new offsets
1347 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1348 {
1349 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1350 }
1351 return offsets_begin() + idx;
1352 }
1353
1354 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1355 template <mpl::iterator_of_type<T> InputIt>
1356 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_values(
1358 InputIt first,
1359 InputIt last
1360 ) -> value_iterator
1361 {
1362 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1363 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1364 auto values = std::ranges::subrange(first, last);
1365 const size_t cumulative_sizes = std::accumulate(
1366 values.begin(),
1367 values.end(),
1368 size_t(0),
1369 [](size_t acc, const T& value)
1370 {
1371 return acc + value.size();
1372 }
1373 );
1374 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1375 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1376 const OT offset_begin = *offset(idx);
1377 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
1378
1379 // Move elements to make space for the new value
1380 std::move_backward(
1381 insert_pos,
1382 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
1383 data_buffer_adaptor.end()
1384 );
1385
1386 for (const T& value : values)
1387 {
1388 std::copy(value.begin(), value.end(), insert_pos);
1389 std::advance(insert_pos, value.size());
1390 }
1391
1392 const auto sizes_of_each_value = std::ranges::views::transform(
1393 values,
1394 [](const T& value) -> offset_type
1395 {
1396 return static_cast<offset_type>(value.size());
1397 }
1398 );
1399 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1400 return sparrow::next(value_begin(), idx);
1401 }
1402
1403 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1404 template <mpl::iterator_of_type<OT> InputIt>
1405 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offsets(
1407 InputIt first_sizes,
1408 InputIt last_sizes
1409 ) -> offset_iterator
1410 {
1411 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1412 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1413 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1414 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1415 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1416 const auto idx = std::distance(offsets_cbegin(), pos);
1417 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1418
1419 // Check for offset overflow before adjusting
1420 if (!offset_buffer_adaptor.empty())
1421 {
1422 const offset_type last_offset = offset_buffer_adaptor.back();
1423 check_offset_overflow(last_offset, cumulative_sizes);
1424 }
1425
1426 const auto sizes_count = std::distance(first_sizes, last_sizes);
1427 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1428 // Move the offsets to make space for the new offsets
1429 std::move_backward(
1430 offset_buffer_adaptor.begin() + idx,
1431 offset_buffer_adaptor.end() - sizes_count,
1432 offset_buffer_adaptor.end()
1433 );
1434 // Adjust offsets for subsequent elements
1435 std::for_each(
1436 offset_buffer_adaptor.begin() + idx + sizes_count,
1437 offset_buffer_adaptor.end(),
1438 [cumulative_sizes](auto& offset)
1439 {
1440 offset += cumulative_sizes;
1441 }
1442 );
1443 // Put the right values in the new offsets
1444 InputIt it = first_sizes;
1445 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1446 {
1447 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1448 ++it;
1449 }
1450 return offset(static_cast<size_t>(idx));
1451 }
1452
1453 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1454 constexpr auto
1455 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_values(const_value_iterator pos, size_type count)
1456 -> value_iterator
1457 {
1458 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1459 SPARROW_ASSERT_TRUE(pos <= value_cend());
1460 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1461 if (count == 0)
1462 {
1463 return sparrow::next(value_begin(), index);
1464 }
1465 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1466 const auto offset_begin = *offset(index);
1467 const auto offset_end = *offset(index + count);
1468 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1469 // move the values after the erased ones
1470 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
1471 data_buffer.resize(data_buffer.size() - difference);
1472 // adjust the offsets for the subsequent elements
1473 erase_offsets(offset(index), count);
1474 return sparrow::next(value_begin(), index);
1475 }
1476
1477 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1478 constexpr auto
1479 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_offsets(const_offset_iterator pos, size_type count)
1480 -> offset_iterator
1481 {
1482 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1483 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1484 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1485 if (count == 0)
1486 {
1487 return offset(index);
1488 }
1489 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1490 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1491 const OT offset_start_value = *offset(index);
1492 const OT offset_end_value = *offset(index + count);
1493 const OT difference = offset_end_value - offset_start_value;
1494 // move the offsets after the erased ones
1495 std::move(
1496 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1497 offset_buffer_adaptor.end(),
1498 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1499 );
1500 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1501 // adjust the offsets for the subsequent elements
1502 std::for_each(
1503 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1504 offset_buffer_adaptor.end(),
1505 [difference](OT& offset)
1506 {
1507 offset -= difference;
1508 }
1509 );
1510 return offset(index);
1511 }
1512
1513}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:114
xsimd::aligned_allocator< T > default_allocator
Definition buffer.hpp:126
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
bitset_iterator< self_type, true > const_iterator
typename storage_type::default_allocator default_allocator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT, Ext > binary_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT, Ext > string_array_impl
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of Arrow data types, usually paired with the raw bytes of the associated value.
Extensions to the C++ standard library.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.