sparrow 2.2.1
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <limits>
20#include <numeric>
21#include <optional>
22#include <ranges>
23#include <stdexcept>
24#include <string>
25#include <vector>
26
39
40namespace sparrow
41{
42 namespace detail
43 {
44 template <class T, class OT>
46
47 template <>
48 struct variable_size_binary_format<std::string, std::int32_t>
49 {
50 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
51 {
52 return "u";
53 }
54 };
55
56 template <>
57 struct variable_size_binary_format<std::string, std::int64_t>
58 {
59 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
60 {
61 return "U";
62 }
63 };
64
65 template <>
66 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
67 {
68 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
69 {
70 return "z";
71 }
72 };
73
74 template <>
75 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
76 {
77 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
78 {
79 return "Z";
80 }
81 };
82 }
83
84
85 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext = empty_extension>
86 class variable_size_binary_array_impl;
87
88 template <layout_offset OT, typename Ext = empty_extension>
92 OT,
93 Ext>;
94
95 template <layout_offset OT, typename Ext = empty_extension>
98 arrow_traits<std::vector<byte_t>>::const_reference,
99 OT,
100 Ext>;
101
115
129
143
157
158 namespace detail
159 {
160 template <>
162 {
163 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
164 {
166 }
167 };
168
169 template <>
171 {
172 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
173 {
175 }
176 };
177
178 template <>
180 {
181 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
182 {
184 }
185 };
186
187 template <>
189 {
190 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
191 {
193 }
194 };
195 }
196
200 template <class T>
201 constexpr bool is_string_array_v = std::same_as<T, string_array>;
202
206 template <class T>
207 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
208
212 template <class T>
213 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
214
218 template <class T>
219 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
220
221 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
223 {
225
229 using offset_type = OT;
230
231 using data_value_type = typename T::value_type;
232
233 using offset_iterator = OT*;
234 using const_offset_iterator = const OT*;
235
238
239 using iterator_tag = std::random_access_iterator_tag;
240
242
251
253
262
264
265 // using iterator = layout_iterator<array_type, false>;
266 // using const_iterator = layout_iterator<array_type, true, CR>;
267 };
268
277 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
279 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT, Ext>>,
280 public Ext
281 {
282 private:
283
284 static_assert(
285 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
286 "Only sequences of types with the same size as uint8_t are supported"
287 );
288
289 public:
290
293
295 using inner_value_type = typename inner_types::inner_value_type;
296 using inner_reference = typename inner_types::inner_reference;
297 using inner_const_reference = typename inner_types::inner_const_reference;
298
299 using offset_type = typename inner_types::offset_type;
304
306 using bitmap_reference = typename base_type::bitmap_reference;
309
313
314 using offset_iterator = typename inner_types::offset_iterator;
315 using const_offset_iterator = typename inner_types::const_offset_iterator;
316
320 using data_iterator = typename inner_types::data_iterator;
321
322 using const_data_iterator = typename inner_types::const_data_iterator;
323 using data_value_type = typename inner_types::data_value_type;
324
325 using value_iterator = typename inner_types::value_iterator;
326 using const_value_iterator = typename inner_types::const_value_iterator;
327
345
359 template <class... ARGS>
362 : self_type(create_proxy(std::forward<ARGS>(args)...))
363 {
364 }
365
366 using base_type::get_arrow_proxy;
367 using base_type::size;
368
381 [[nodiscard]] constexpr inner_reference value(size_type i);
382
398 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
399
416 template <std::ranges::range SIZES_RANGE>
417 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
418
419 private:
420
441 template <
444 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
445 [[nodiscard]] static arrow_proxy create_proxy(
446 u8_buffer<C>&& data_buffer,
447 offset_buffer_type&& list_offsets,
449 std::optional<std::string_view> name = std::nullopt,
450 std::optional<METADATA_RANGE> metadata = std::nullopt
451 );
452
471 template <
472 std::ranges::input_range R,
474 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
475 requires(
476 std::ranges::input_range<std::ranges::range_value_t<R>>
478 )
479 [[nodiscard]] static arrow_proxy create_proxy(
480 R&& values,
482 std::optional<std::string_view> name = std::nullopt,
483 std::optional<METADATA_RANGE> metadata = std::nullopt
484 );
485
503 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
504 requires(
505 std::ranges::input_range<std::ranges::range_value_t<R>>
506 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
507 )
508 [[nodiscard]] static arrow_proxy create_proxy(
509 R&& values,
510 bool nullable,
511 std::optional<std::string_view> name = std::nullopt,
512 std::optional<METADATA_RANGE> metadata = std::nullopt
513 );
514
529 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
530 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
531 [[nodiscard]] static arrow_proxy create_proxy(
532 R&&,
533 std::optional<std::string_view> name = std::nullopt,
534 std::optional<METADATA_RANGE> metadata = std::nullopt
535 );
536
554 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
555 [[nodiscard]] static arrow_proxy create_proxy_impl(
556 u8_buffer<C>&& data_buffer,
557 offset_buffer_type&& list_offsets,
558 std::optional<validity_bitmap>&&,
559 std::optional<std::string_view> name = std::nullopt,
560 std::optional<METADATA_RANGE> metadata = std::nullopt
561 );
562
563 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
564 static constexpr size_t DATA_BUFFER_INDEX = 2;
565
577 [[nodiscard]] constexpr offset_iterator offset(size_type i);
578
586 [[nodiscard]] constexpr offset_iterator offsets_begin();
587
595 [[nodiscard]] constexpr offset_iterator offsets_end();
596
608 [[nodiscard]] constexpr data_iterator data(size_type i);
609
617 [[nodiscard]] constexpr value_iterator value_begin();
618
626 [[nodiscard]] constexpr value_iterator value_end();
627
635 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
636
644 [[nodiscard]] constexpr const_value_iterator value_cend() const;
645
657 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
658
666 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
667
675 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
676
688 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
689
690 // Modifiers
691
704 template <std::ranges::sized_range U>
705 requires mpl::convertible_ranges<U, T>
706 constexpr void resize_values(size_type new_length, U value);
707
717 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
718
734 template <std::ranges::sized_range U>
735 requires mpl::convertible_ranges<U, T>
736 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
737
750 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
751
768 template <mpl::iterator_of_type<T> InputIt>
769 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
770
791 template <mpl::iterator_of_type<OT> InputIt>
792 constexpr offset_iterator
793 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
794
812 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
813
830 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
831
847 template <std::ranges::sized_range U>
848 requires mpl::convertible_ranges<U, T>
849 constexpr void assign(U&& rhs, size_type index);
850
860 constexpr void check_offset_overflow(offset_type current_offset, offset_type size_to_add) const;
861
864 friend base_type;
867 };
868
869 /*********************************************
870 * variable_size_binary_array_impl implementation *
871 *********************************************/
872
873 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
875 : base_type(std::move(proxy))
876 {
877 const auto type = this->get_arrow_proxy().data_type();
880 || type == data_type::LARGE_BINARY
881 );
883 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
884 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
885 && std::same_as<OT, int64_t>) )
886 );
887 }
888
889 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
890 template <std::ranges::range SIZES_RANGE>
893 {
895 std::forward<SIZES_RANGE>(sizes)
896 );
897 }
898
899 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
900 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
901 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
902 u8_buffer<C>&& data_buffer,
903 offset_buffer_type&& offsets,
904 VB&& validity_input,
905 std::optional<std::string_view> name,
906 std::optional<METADATA_RANGE> metadata
907 )
908 {
909 const auto size = offsets.size() - 1;
910 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
911 const auto null_count = vbitmap.null_count();
912
915 std::move(name), // name
916 std::move(metadata), // metadata
917 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
918 nullptr, // children
919 repeat_view<bool>(true, 0),
920 nullptr, // dictionary
921 true
922
923 );
924 std::vector<buffer<std::uint8_t>> arr_buffs;
925 arr_buffs.reserve(3);
926 arr_buffs.emplace_back(std::move(vbitmap).extract_storage());
927 arr_buffs.emplace_back(std::move(offsets).extract_storage());
928 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
929
931 static_cast<std::int64_t>(size), // length
932 static_cast<int64_t>(null_count),
933 0, // offset
934 std::move(arr_buffs),
935 nullptr, // children
936 repeat_view<bool>(true, 0),
937 nullptr, // dictionary
938 true
939 );
940 return arrow_proxy{std::move(arr), std::move(schema)};
941 }
942
943 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
944 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
945 requires(
946 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
948 // range of char-like
949 )
950 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
951 R&& values,
952 VB&& validity_input,
953 std::optional<std::string_view> name,
954 std::optional<METADATA_RANGE> metadata
955 )
956 {
957 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
958
959 auto size_range = values
960 | std::views::transform(
961 [](const auto& v)
962 {
963 return std::ranges::size(v);
964 }
965 );
966 auto offset_buffer = offset_from_sizes(size_range);
967 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
968 return create_proxy(
969 std::move(data_buffer),
970 std::move(offset_buffer),
971 std::forward<VB>(validity_input),
972 std::forward<std::optional<std::string_view>>(name),
973 std::forward<std::optional<METADATA_RANGE>>(metadata)
974 );
975 }
976
977 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
978 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
979 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
980 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
981 R&& range,
982 std::optional<std::string_view> name,
983 std::optional<METADATA_RANGE> metadata
984 )
985 {
986 // split into values and is_non_null ranges
987 const auto values = range
988 | std::views::transform(
989 [](const auto& v)
990 {
991 return v.get();
992 }
993 );
994 const auto is_non_null = range
995 | std::views::transform(
996 [](const auto& v)
997 {
998 return v.has_value();
999 }
1000 );
1001 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
1002 }
1003
1004 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1005 template <
1006 std::ranges::input_range R,
1007 input_metadata_container METADATA_RANGE>
1008 requires(
1009 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1011 // range of
1012 // char-like
1013 )
1014 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1015 R&& values,
1016 bool nullable,
1017 std::optional<std::string_view> name,
1018 std::optional<METADATA_RANGE> metadata
1019 )
1020 {
1021 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1022 const size_t size = std::ranges::size(values);
1023 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1024 auto size_range = values
1025 | std::views::transform(
1026 [](const auto& v)
1027 {
1028 return std::ranges::size(v);
1029 }
1030 );
1031 auto offset_buffer = offset_from_sizes(size_range);
1032 return create_proxy_impl(
1033 std::move(data_buffer),
1034 std::move(offset_buffer),
1035 nullable ? std::make_optional<validity_bitmap>(nullptr, size, validity_bitmap::default_allocator())
1036 : std::nullopt,
1037 std::move(name),
1038 std::move(metadata)
1039 );
1040 }
1041
1042 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1043 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1044 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy_impl(
1045 u8_buffer<C>&& data_buffer,
1046 offset_buffer_type&& list_offsets,
1047 std::optional<validity_bitmap>&& bitmap,
1048 std::optional<std::string_view> name,
1049 std::optional<METADATA_RANGE> metadata
1050 )
1051 {
1052 const auto size = list_offsets.size() - 1;
1053 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1054
1055 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1056 flags = bitmap.has_value()
1057 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1058 : std::nullopt;
1059
1060 ArrowSchema schema = make_arrow_schema(
1062 std::move(name), // name
1063 std::move(metadata), // metadata
1064 flags, // flags,
1065 nullptr, // children
1066 repeat_view<bool>(true, 0),
1067 nullptr, // dictionary
1068 true
1069
1070 );
1071 std::vector<buffer<std::uint8_t>> arr_buffs;
1072 arr_buffs.reserve(3);
1073 arr_buffs.emplace_back(
1074 bitmap.has_value() ? std::move(*bitmap).extract_storage()
1075 : buffer<std::uint8_t>{nullptr, 0, buffer<std::uint8_t>::default_allocator()}
1076 );
1077 arr_buffs.emplace_back(std::move(list_offsets).extract_storage());
1078 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
1079
1080 ArrowArray arr = make_arrow_array(
1081 static_cast<std::int64_t>(size), // length
1082 static_cast<int64_t>(null_count),
1083 0, // offset
1084 std::move(arr_buffs),
1085 nullptr, // children
1086 repeat_view<bool>(true, 0),
1087 nullptr, // dictionary
1088 true
1089 );
1090 arrow_proxy proxy{std::move(arr), std::move(schema)};
1091 Ext::init(proxy);
1092 return proxy;
1093 }
1094
1095 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1096 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) -> data_iterator
1097 {
1098 arrow_proxy& proxy = get_arrow_proxy();
1099 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1100 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1101 }
1102
1103 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1104 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) const
1105 -> const_data_iterator
1106 {
1107 const arrow_proxy& proxy = this->get_arrow_proxy();
1108 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1109 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1110 }
1111
1112 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1113 template <std::ranges::sized_range U>
1115 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::assign(U&& rhs, size_type index)
1116 {
1117 SPARROW_ASSERT_TRUE(index < size());
1118 const auto offset_beg = *offset(index);
1119 const auto offset_end = *offset(index + 1);
1120 const auto initial_value_length = offset_end - offset_beg;
1121 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1122 const OT shift_byte_count = new_value_length - initial_value_length;
1123 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1124 if (shift_byte_count != 0)
1125 {
1126 // Check for offset overflow before adjusting
1127 if (shift_byte_count > 0)
1128 {
1129 const offset_type last_offset = *offset(size());
1130 check_offset_overflow(last_offset, shift_byte_count);
1131 }
1132
1133 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1134 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1135 : data_buffer.size() + shift_val_abs;
1136
1137 if (shift_byte_count > 0)
1138 {
1139 data_buffer.resize(new_data_buffer_size);
1140 // Move elements to make space for the new value
1141 std::move_backward(
1142 sparrow::next(data_buffer.begin(), offset_end),
1143 sparrow::next(data_buffer.end(), -shift_byte_count),
1144 data_buffer.end()
1145 );
1146 }
1147 else
1148 {
1149 std::move(
1150 sparrow::next(data_buffer.begin(), offset_end),
1151 data_buffer.end(),
1152 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
1153 );
1154 data_buffer.resize(new_data_buffer_size);
1155 }
1156 // Adjust offsets for subsequent elements
1157 std::for_each(
1158 offset(index + 1),
1159 offset(size() + 1),
1160 [shift_byte_count](auto& offset)
1161 {
1162 offset += shift_byte_count;
1163 }
1164 );
1165 }
1166 auto tmp = std::views::transform(
1167 rhs,
1168 [](const auto& val)
1169 {
1170 return static_cast<std::uint8_t>(val);
1171 }
1172 );
1173 // Copy the new value into the buffer
1174 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
1175 }
1176
1177 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1178 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::check_offset_overflow(
1179 offset_type current_offset,
1180 offset_type size_to_add
1181 ) const
1182 {
1183 constexpr offset_type max_offset = std::numeric_limits<offset_type>::max();
1184 if (current_offset > max_offset - size_to_add)
1185 {
1186 throw std::overflow_error("Offset overflow: adding elements would exceed maximum offset value");
1187 }
1188 }
1189
1190 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1191 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) -> offset_iterator
1192 {
1193 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1194 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1195 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1196 }
1197
1198 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1199 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) const
1200 -> const_offset_iterator
1201 {
1202 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1203 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1204 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1205 }
1206
1207 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1208 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_begin() -> offset_iterator
1209 {
1210 return offset(0);
1211 }
1212
1213 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1214 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cbegin() const
1215 -> const_offset_iterator
1216 {
1217 return offset(0);
1218 }
1219
1220 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1221 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_end() -> offset_iterator
1222 {
1223 return offset(size() + 1);
1224 }
1225
1226 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1227 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cend() const
1228 -> const_offset_iterator
1229 {
1230 return offset(size() + 1);
1231 }
1232
1233 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1235 {
1236 SPARROW_ASSERT_TRUE(i < size());
1237 return inner_reference(this, i);
1238 }
1239
1240 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1243 {
1244 SPARROW_ASSERT_TRUE(i < this->size());
1245 const OT offset_begin = *offset(i);
1246 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1247 const OT offset_end = *offset(i + 1);
1248 SPARROW_ASSERT_TRUE(offset_end >= 0);
1249 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1250 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1251 return inner_const_reference(pointer_begin, pointer_end);
1252 }
1253
1254 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1255 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_begin() -> value_iterator
1256 {
1257 return value_iterator{this, 0};
1258 }
1259
1260 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1261 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_end() -> value_iterator
1262 {
1263 return sparrow::next(value_begin(), size());
1264 }
1265
1266 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1267 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cbegin() const -> const_value_iterator
1268 {
1269 return const_value_iterator{this, 0};
1270 }
1271
1272 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1273 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cend() const -> const_value_iterator
1274 {
1275 return sparrow::next(value_cbegin(), this->size());
1276 }
1277
1278 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1279 template <std::ranges::sized_range U>
1281 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::resize_values(size_type new_length, U value)
1282 {
1283 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1284 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1285 if (new_length < size())
1286 {
1287 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1288 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1289 data_buffer.resize(offset_begin);
1290 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1291 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1292 offset_buffer_adaptor.resize(new_size + 1);
1293 }
1294 else if (new_length > size())
1295 {
1296 insert_value(value_cend(), value, new_length - size());
1297 }
1298 }
1299
1300 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1301 template <std::ranges::sized_range U>
1303 constexpr auto
1304 variable_size_binary_array_impl<T, CR, OT, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1306 {
1307 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1308 const OT offset_begin = *offset(idx);
1309 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
1310 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
1311 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1312 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1313 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
1314 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1315 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1316 return sparrow::next(value_begin(), idx);
1317 }
1318
1319 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1320 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offset(
1321 const_offset_iterator pos,
1322 offset_type value_size,
1323 size_type count
1324 ) -> offset_iterator
1325 {
1326 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1327 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1328 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1329 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1330
1331 // Check for offset overflow before adjusting
1332 if (!offset_buffer_adaptor.empty())
1333 {
1334 const offset_type last_offset = offset_buffer_adaptor.back();
1335 check_offset_overflow(last_offset, cumulative_size);
1336 }
1337
1338 // Adjust offsets for subsequent elements
1339 std::for_each(
1340 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1341 offset_buffer_adaptor.end(),
1342 [cumulative_size](auto& offset)
1343 {
1344 offset += cumulative_size;
1345 }
1346 );
1347 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1348 // Put the right values in the new offsets
1349 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1350 {
1351 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1352 }
1353 return offsets_begin() + idx;
1354 }
1355
1356 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1357 template <mpl::iterator_of_type<T> InputIt>
1358 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_values(
1360 InputIt first,
1361 InputIt last
1362 ) -> value_iterator
1363 {
1364 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1365 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1366 auto values = std::ranges::subrange(first, last);
1367 const size_t cumulative_sizes = std::accumulate(
1368 values.begin(),
1369 values.end(),
1370 size_t(0),
1371 [](size_t acc, const T& value)
1372 {
1373 return acc + value.size();
1374 }
1375 );
1376 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1377 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1378 const OT offset_begin = *offset(idx);
1379 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
1380
1381 // Move elements to make space for the new value
1382 std::move_backward(
1383 insert_pos,
1384 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
1385 data_buffer_adaptor.end()
1386 );
1387
1388 for (const T& value : values)
1389 {
1390 std::copy(value.begin(), value.end(), insert_pos);
1391 std::advance(insert_pos, value.size());
1392 }
1393
1394 const auto sizes_of_each_value = std::ranges::views::transform(
1395 values,
1396 [](const T& value) -> offset_type
1397 {
1398 return static_cast<offset_type>(value.size());
1399 }
1400 );
1401 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1402 return sparrow::next(value_begin(), idx);
1403 }
1404
1405 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1406 template <mpl::iterator_of_type<OT> InputIt>
1407 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offsets(
1409 InputIt first_sizes,
1410 InputIt last_sizes
1411 ) -> offset_iterator
1412 {
1413 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1414 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1415 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1416 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1417 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1418 const auto idx = std::distance(offsets_cbegin(), pos);
1419 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1420
1421 // Check for offset overflow before adjusting
1422 if (!offset_buffer_adaptor.empty())
1423 {
1424 const offset_type last_offset = offset_buffer_adaptor.back();
1425 check_offset_overflow(last_offset, cumulative_sizes);
1426 }
1427
1428 const auto sizes_count = std::distance(first_sizes, last_sizes);
1429 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1430 // Move the offsets to make space for the new offsets
1431 std::move_backward(
1432 offset_buffer_adaptor.begin() + idx,
1433 offset_buffer_adaptor.end() - sizes_count,
1434 offset_buffer_adaptor.end()
1435 );
1436 // Adjust offsets for subsequent elements
1437 std::for_each(
1438 offset_buffer_adaptor.begin() + idx + sizes_count,
1439 offset_buffer_adaptor.end(),
1440 [cumulative_sizes](auto& offset)
1441 {
1442 offset += cumulative_sizes;
1443 }
1444 );
1445 // Put the right values in the new offsets
1446 InputIt it = first_sizes;
1447 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1448 {
1449 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1450 ++it;
1451 }
1452 return offset(static_cast<size_t>(idx));
1453 }
1454
1455 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1456 constexpr auto
1457 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_values(const_value_iterator pos, size_type count)
1458 -> value_iterator
1459 {
1460 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1461 SPARROW_ASSERT_TRUE(pos <= value_cend());
1462 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1463 if (count == 0)
1464 {
1465 return sparrow::next(value_begin(), index);
1466 }
1467 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1468 const auto offset_begin = *offset(index);
1469 const auto offset_end = *offset(index + count);
1470 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1471 // move the values after the erased ones
1472 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
1473 data_buffer.resize(data_buffer.size() - difference);
1474 // adjust the offsets for the subsequent elements
1475 erase_offsets(offset(index), count);
1476 return sparrow::next(value_begin(), index);
1477 }
1478
1479 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1480 constexpr auto
1481 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_offsets(const_offset_iterator pos, size_type count)
1482 -> offset_iterator
1483 {
1484 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1485 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1486 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1487 if (count == 0)
1488 {
1489 return offset(index);
1490 }
1491 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1492 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1493 const OT offset_start_value = *offset(index);
1494 const OT offset_end_value = *offset(index + count);
1495 const OT difference = offset_end_value - offset_start_value;
1496 // move the offsets after the erased ones
1497 std::move(
1498 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1499 offset_buffer_adaptor.end(),
1500 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1501 );
1502 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1503 // adjust the offsets for the subsequent elements
1504 std::for_each(
1505 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1506 offset_buffer_adaptor.end(),
1507 [difference](OT& offset)
1508 {
1509 offset -= difference;
1510 }
1511 );
1512 return offset(index);
1513 }
1514
1515}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:114
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
typename storage_type::default_allocator default_allocator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT, Ext > binary_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT, Ext > string_array_impl
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of Arrow data types, usually associated with the raw bytes holding the corresponding value.
Extensions to the C++ standard library.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.