sparrow 2.2.1
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <limits>
20#include <numeric>
21#include <optional>
22#include <ranges>
23#include <stdexcept>
24#include <string>
25#include <vector>
26
41
42namespace sparrow
43{
44 namespace detail
45 {
46 template <class T, class OT>
48
49 template <>
50 struct variable_size_binary_format<std::string, std::int32_t>
51 {
52 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
53 {
54 return "u";
55 }
56 };
57
58 template <>
59 struct variable_size_binary_format<std::string, std::int64_t>
60 {
61 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
62 {
63 return "U";
64 }
65 };
66
67 template <>
68 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
69 {
70 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
71 {
72 return "z";
73 }
74 };
75
76 template <>
77 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
78 {
79 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
80 {
81 return "Z";
82 }
83 };
84 }
85
86
87 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext = empty_extension>
88 class variable_size_binary_array_impl;
89
90 namespace copy_tracker
91 {
92 template <typename T>
94 std::string key()
95 {
96 return "variable_size_binary_array";
97 }
98 }
99
100 template <layout_offset OT, typename Ext = empty_extension>
104 OT,
105 Ext>;
106
107 template <layout_offset OT, typename Ext = empty_extension>
110 arrow_traits<std::vector<byte_t>>::const_reference,
111 OT,
112 Ext>;
113
127
141
155
169
170 namespace detail
171 {
172 template <>
174 {
175 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
176 {
178 }
179 };
180
181 template <>
183 {
184 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
185 {
187 }
188 };
189
190 template <>
192 {
193 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
194 {
196 }
197 };
198
199 template <>
201 {
202 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
203 {
205 }
206 };
207 }
208
212 template <class T>
213 constexpr bool is_string_array_v = std::same_as<T, string_array>;
214
218 template <class T>
219 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
220
224 template <class T>
225 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
226
230 template <class T>
231 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
232
233 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
235 {
237
241 using offset_type = OT;
242
243 using data_value_type = typename T::value_type;
244
245 using offset_iterator = OT*;
246 using const_offset_iterator = const OT*;
247
250
251 using iterator_tag = std::random_access_iterator_tag;
252
254
263
265
274
276
277 // using iterator = layout_iterator<array_type, false>;
278 // using const_iterator = layout_iterator<array_type, true, CR>;
279 };
280
289 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
291 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT, Ext>>,
292 public Ext
293 {
294 private:
295
296 static_assert(
297 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
298 "Only sequences of types with the same size as uint8_t are supported"
299 );
300
301 public:
302
305
307 using inner_value_type = typename inner_types::inner_value_type;
308 using inner_reference = typename inner_types::inner_reference;
309 using inner_const_reference = typename inner_types::inner_const_reference;
310
311 using offset_type = typename inner_types::offset_type;
316
318 using bitmap_reference = typename base_type::bitmap_reference;
321
325
326 using offset_iterator = typename inner_types::offset_iterator;
327 using const_offset_iterator = typename inner_types::const_offset_iterator;
328
332 using data_iterator = typename inner_types::data_iterator;
333
334 using const_data_iterator = typename inner_types::const_data_iterator;
335 using data_value_type = typename inner_types::data_value_type;
336
337 using value_iterator = typename inner_types::value_iterator;
338 using const_value_iterator = typename inner_types::const_value_iterator;
339
357
367
375
382
390
404 template <class... ARGS>
407 : self_type(create_proxy(std::forward<ARGS>(args)...))
408 {
409 }
410
411 using base_type::get_arrow_proxy;
412 using base_type::size;
413
426 [[nodiscard]] constexpr inner_reference value(size_type i);
427
443 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
444
461 template <std::ranges::range SIZES_RANGE>
462 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
463
464 private:
465
486 template <
489 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
490 [[nodiscard]] static arrow_proxy create_proxy(
491 u8_buffer<C>&& data_buffer,
492 offset_buffer_type&& list_offsets,
494 std::optional<std::string_view> name = std::nullopt,
495 std::optional<METADATA_RANGE> metadata = std::nullopt
496 );
497
516 template <
517 std::ranges::input_range R,
519 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
520 requires(
521 std::ranges::input_range<std::ranges::range_value_t<R>>
523 )
524 [[nodiscard]] static arrow_proxy create_proxy(
525 R&& values,
527 std::optional<std::string_view> name = std::nullopt,
528 std::optional<METADATA_RANGE> metadata = std::nullopt
529 );
530
548 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
549 requires(
550 std::ranges::input_range<std::ranges::range_value_t<R>>
551 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
552 )
553 [[nodiscard]] static arrow_proxy create_proxy(
554 R&& values,
555 bool nullable,
556 std::optional<std::string_view> name = std::nullopt,
557 std::optional<METADATA_RANGE> metadata = std::nullopt
558 );
559
574 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
575 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
576 [[nodiscard]] static arrow_proxy create_proxy(
577 R&&,
578 std::optional<std::string_view> name = std::nullopt,
579 std::optional<METADATA_RANGE> metadata = std::nullopt
580 );
581
599 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
600 [[nodiscard]] static arrow_proxy create_proxy_impl(
601 u8_buffer<C>&& data_buffer,
602 offset_buffer_type&& list_offsets,
603 std::optional<validity_bitmap>&&,
604 std::optional<std::string_view> name = std::nullopt,
605 std::optional<METADATA_RANGE> metadata = std::nullopt
606 );
607
608 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
609 static constexpr size_t DATA_BUFFER_INDEX = 2;
610
622 [[nodiscard]] constexpr offset_iterator offset(size_type i);
623
631 [[nodiscard]] constexpr offset_iterator offsets_begin();
632
640 [[nodiscard]] constexpr offset_iterator offsets_end();
641
653 [[nodiscard]] constexpr data_iterator data(size_type i);
654
662 [[nodiscard]] constexpr value_iterator value_begin();
663
671 [[nodiscard]] constexpr value_iterator value_end();
672
680 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
681
689 [[nodiscard]] constexpr const_value_iterator value_cend() const;
690
702 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
703
711 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
712
720 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
721
733 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
734
735 // Modifiers
736
749 template <std::ranges::sized_range U>
750 requires mpl::convertible_ranges<U, T>
751 constexpr void resize_values(size_type new_length, U value);
752
762 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
763
779 template <std::ranges::sized_range U>
780 requires mpl::convertible_ranges<U, T>
781 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
782
795 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
796
813 template <mpl::iterator_of_type<T> InputIt>
814 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
815
836 template <mpl::iterator_of_type<OT> InputIt>
837 constexpr offset_iterator
838 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
839
857 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
858
875 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
876
892 template <std::ranges::sized_range U>
893 requires mpl::convertible_ranges<U, T>
894 constexpr void assign(U&& rhs, size_type index);
895
905 constexpr void check_offset_overflow(offset_type current_offset, offset_type size_to_add) const;
906
909 friend base_type;
912 };
913
914 /*********************************************
915 * variable_size_binary_array_impl implementation *
916 *********************************************/
917
918 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
920 : base_type(std::move(proxy))
921 {
922 const auto type = this->get_arrow_proxy().data_type();
925 || type == data_type::LARGE_BINARY
926 );
928 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
929 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
930 && std::same_as<OT, int64_t>) )
931 );
932 }
933
934 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
942
943 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
951
952 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
953 template <std::ranges::range SIZES_RANGE>
956 {
958 std::forward<SIZES_RANGE>(sizes)
959 );
960 }
961
962 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
963 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
964 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
965 u8_buffer<C>&& data_buffer,
966 offset_buffer_type&& offsets,
967 VB&& validity_input,
968 std::optional<std::string_view> name,
969 std::optional<METADATA_RANGE> metadata
970 )
971 {
972 const auto size = offsets.size() - 1;
973 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
974 const auto null_count = vbitmap.null_count();
975
978 std::move(name), // name
979 std::move(metadata), // metadata
980 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
981 nullptr, // children
982 repeat_view<bool>(true, 0),
983 nullptr, // dictionary
984 true
985
986 );
987 std::vector<buffer<std::uint8_t>> arr_buffs;
988 arr_buffs.reserve(3);
989 arr_buffs.emplace_back(std::move(vbitmap).extract_storage());
990 arr_buffs.emplace_back(std::move(offsets).extract_storage());
991 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
992
994 static_cast<std::int64_t>(size), // length
995 static_cast<int64_t>(null_count),
996 0, // offset
997 std::move(arr_buffs),
998 nullptr, // children
999 repeat_view<bool>(true, 0),
1000 nullptr, // dictionary
1001 true
1002 );
1003 return arrow_proxy{std::move(arr), std::move(schema)};
1004 }
1005
1006 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1007 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
1008 requires(
1009 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1011 // range of char-like
1012 )
1013 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1014 R&& values,
1015 VB&& validity_input,
1016 std::optional<std::string_view> name,
1017 std::optional<METADATA_RANGE> metadata
1018 )
1019 {
1020 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1021
1022 auto size_range = values
1023 | std::views::transform(
1024 [](const auto& v)
1025 {
1026 return std::ranges::size(v);
1027 }
1028 );
1029 auto offset_buffer = offset_from_sizes(size_range);
1030 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
1031 return create_proxy(
1032 std::move(data_buffer),
1033 std::move(offset_buffer),
1034 std::forward<VB>(validity_input),
1035 std::forward<std::optional<std::string_view>>(name),
1036 std::forward<std::optional<METADATA_RANGE>>(metadata)
1037 );
1038 }
1039
1040 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1041 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
1042 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
1043 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1044 R&& range,
1045 std::optional<std::string_view> name,
1046 std::optional<METADATA_RANGE> metadata
1047 )
1048 {
1049 // split into values and is_non_null ranges
1050 const auto values = range
1051 | std::views::transform(
1052 [](const auto& v)
1053 {
1054 return v.get();
1055 }
1056 );
1057 const auto is_non_null = range
1058 | std::views::transform(
1059 [](const auto& v)
1060 {
1061 return v.has_value();
1062 }
1063 );
1064 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
1065 }
1066
1067 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1068 template <
1069 std::ranges::input_range R,
1070 input_metadata_container METADATA_RANGE>
1071 requires(
1072 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1074 // range of
1075 // char-like
1076 )
1077 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1078 R&& values,
1079 bool nullable,
1080 std::optional<std::string_view> name,
1081 std::optional<METADATA_RANGE> metadata
1082 )
1083 {
1084 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1085 const size_t size = std::ranges::size(values);
1086 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1087 auto size_range = values
1088 | std::views::transform(
1089 [](const auto& v)
1090 {
1091 return std::ranges::size(v);
1092 }
1093 );
1094 auto offset_buffer = offset_from_sizes(size_range);
1095 return create_proxy_impl(
1096 std::move(data_buffer),
1097 std::move(offset_buffer),
1098 nullable ? std::make_optional<validity_bitmap>(nullptr, size, validity_bitmap::default_allocator())
1099 : std::nullopt,
1100 std::move(name),
1101 std::move(metadata)
1102 );
1103 }
1104
1105 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1106 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1107 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy_impl(
1108 u8_buffer<C>&& data_buffer,
1109 offset_buffer_type&& list_offsets,
1110 std::optional<validity_bitmap>&& bitmap,
1111 std::optional<std::string_view> name,
1112 std::optional<METADATA_RANGE> metadata
1113 )
1114 {
1115 const auto size = list_offsets.size() - 1;
1116 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1117
1118 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1119 flags = bitmap.has_value()
1120 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1121 : std::nullopt;
1122
1123 ArrowSchema schema = make_arrow_schema(
1125 std::move(name), // name
1126 std::move(metadata), // metadata
1127 flags, // flags,
1128 nullptr, // children
1129 repeat_view<bool>(true, 0),
1130 nullptr, // dictionary
1131 true
1132
1133 );
1134 std::vector<buffer<std::uint8_t>> arr_buffs;
1135 arr_buffs.reserve(3);
1136 arr_buffs.emplace_back(
1137 bitmap.has_value() ? std::move(*bitmap).extract_storage()
1138 : buffer<std::uint8_t>{nullptr, 0, buffer<std::uint8_t>::default_allocator()}
1139 );
1140 arr_buffs.emplace_back(std::move(list_offsets).extract_storage());
1141 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
1142
1143 ArrowArray arr = make_arrow_array(
1144 static_cast<std::int64_t>(size), // length
1145 static_cast<int64_t>(null_count),
1146 0, // offset
1147 std::move(arr_buffs),
1148 nullptr, // children
1149 repeat_view<bool>(true, 0),
1150 nullptr, // dictionary
1151 true
1152 );
1153 arrow_proxy proxy{std::move(arr), std::move(schema)};
1154 Ext::init(proxy);
1155 return proxy;
1156 }
1157
1158 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1159 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) -> data_iterator
1160 {
1161 arrow_proxy& proxy = get_arrow_proxy();
1162 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1163 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1164 }
1165
1166 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1167 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) const
1168 -> const_data_iterator
1169 {
1170 const arrow_proxy& proxy = this->get_arrow_proxy();
1171 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1172 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1173 }
1174
1175 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1176 template <std::ranges::sized_range U>
1178 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::assign(U&& rhs, size_type index)
1179 {
1180 SPARROW_ASSERT_TRUE(index < size());
1181 const auto offset_beg = *offset(index);
1182 const auto offset_end = *offset(index + 1);
1183 const auto initial_value_length = offset_end - offset_beg;
1184 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1185 const OT shift_byte_count = new_value_length - initial_value_length;
1186 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1187 if (shift_byte_count != 0)
1188 {
1189 // Check for offset overflow before adjusting
1190 if (shift_byte_count > 0)
1191 {
1192 const offset_type last_offset = *offset(size());
1193 check_offset_overflow(last_offset, shift_byte_count);
1194 }
1195
1196 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1197 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1198 : data_buffer.size() + shift_val_abs;
1199
1200 if (shift_byte_count > 0)
1201 {
1202 data_buffer.resize(new_data_buffer_size);
1203 // Move elements to make space for the new value
1204 std::move_backward(
1205 sparrow::next(data_buffer.begin(), offset_end),
1206 sparrow::next(data_buffer.end(), -shift_byte_count),
1207 data_buffer.end()
1208 );
1209 }
1210 else
1211 {
1212 std::move(
1213 sparrow::next(data_buffer.begin(), offset_end),
1214 data_buffer.end(),
1215 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
1216 );
1217 data_buffer.resize(new_data_buffer_size);
1218 }
1219 // Adjust offsets for subsequent elements
1220 std::for_each(
1221 offset(index + 1),
1222 offset(size() + 1),
1223 [shift_byte_count](auto& offset)
1224 {
1225 offset += shift_byte_count;
1226 }
1227 );
1228 }
1229 auto tmp = std::views::transform(
1230 rhs,
1231 [](const auto& val)
1232 {
1233 return static_cast<std::uint8_t>(val);
1234 }
1235 );
1236 // Copy the new value into the buffer
1237 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
1238 }
1239
1240 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1241 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::check_offset_overflow(
1242 offset_type current_offset,
1243 offset_type size_to_add
1244 ) const
1245 {
1246 constexpr offset_type max_offset = std::numeric_limits<offset_type>::max();
1247 if (current_offset > max_offset - size_to_add)
1248 {
1249 throw std::overflow_error("Offset overflow: adding elements would exceed maximum offset value");
1250 }
1251 }
1252
1253 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1254 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) -> offset_iterator
1255 {
1256 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1257 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1258 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1259 }
1260
1261 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1262 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) const
1263 -> const_offset_iterator
1264 {
1265 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1266 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1267 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1268 }
1269
1270 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1271 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_begin() -> offset_iterator
1272 {
1273 return offset(0);
1274 }
1275
1276 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1277 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cbegin() const
1278 -> const_offset_iterator
1279 {
1280 return offset(0);
1281 }
1282
1283 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1284 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_end() -> offset_iterator
1285 {
1286 return offset(size() + 1);
1287 }
1288
1289 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1290 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cend() const
1291 -> const_offset_iterator
1292 {
1293 return offset(size() + 1);
1294 }
1295
1296 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1298 {
1299 SPARROW_ASSERT_TRUE(i < size());
1300 return inner_reference(this, i);
1301 }
1302
1303 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1306 {
1307 SPARROW_ASSERT_TRUE(i < this->size());
1308 const OT offset_begin = *offset(i);
1309 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1310 const OT offset_end = *offset(i + 1);
1311 SPARROW_ASSERT_TRUE(offset_end >= 0);
1312 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1313 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1314 return inner_const_reference(pointer_begin, pointer_end);
1315 }
1316
1317 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1318 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_begin() -> value_iterator
1319 {
1320 return value_iterator{this, 0};
1321 }
1322
1323 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1324 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_end() -> value_iterator
1325 {
1326 return sparrow::next(value_begin(), size());
1327 }
1328
1329 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1330 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cbegin() const -> const_value_iterator
1331 {
1332 return const_value_iterator{this, 0};
1333 }
1334
1335 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1336 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cend() const -> const_value_iterator
1337 {
1338 return sparrow::next(value_cbegin(), this->size());
1339 }
1340
1341 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1342 template <std::ranges::sized_range U>
1344 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::resize_values(size_type new_length, U value)
1345 {
1346 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1347 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1348 if (new_length < size())
1349 {
1350 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1351 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1352 data_buffer.resize(offset_begin);
1353 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1354 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1355 offset_buffer_adaptor.resize(new_size + 1);
1356 }
1357 else if (new_length > size())
1358 {
1359 insert_value(value_cend(), value, new_length - size());
1360 }
1361 }
1362
1363 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1364 template <std::ranges::sized_range U>
1366 constexpr auto
1367 variable_size_binary_array_impl<T, CR, OT, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1369 {
1370 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1371 const OT offset_begin = *offset(idx);
1372 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
1373 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
1374 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1375 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1376 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
1377 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1378 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1379 return sparrow::next(value_begin(), idx);
1380 }
1381
1382 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1383 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offset(
1384 const_offset_iterator pos,
1385 offset_type value_size,
1386 size_type count
1387 ) -> offset_iterator
1388 {
1389 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1390 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1391 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1392 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1393
1394 // Check for offset overflow before adjusting
1395 if (!offset_buffer_adaptor.empty())
1396 {
1397 const offset_type last_offset = offset_buffer_adaptor.back();
1398 check_offset_overflow(last_offset, cumulative_size);
1399 }
1400
1401 // Adjust offsets for subsequent elements
1402 std::for_each(
1403 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1404 offset_buffer_adaptor.end(),
1405 [cumulative_size](auto& offset)
1406 {
1407 offset += cumulative_size;
1408 }
1409 );
1410 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1411 // Put the right values in the new offsets
1412 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1413 {
1414 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1415 }
1416 return offsets_begin() + idx;
1417 }
1418
1419 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1420 template <mpl::iterator_of_type<T> InputIt>
1421 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_values(
1423 InputIt first,
1424 InputIt last
1425 ) -> value_iterator
1426 {
1427 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1428 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1429 auto values = std::ranges::subrange(first, last);
1430 const size_t cumulative_sizes = std::accumulate(
1431 values.begin(),
1432 values.end(),
1433 size_t(0),
1434 [](size_t acc, const T& value)
1435 {
1436 return acc + value.size();
1437 }
1438 );
1439 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1440 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1441 const OT offset_begin = *offset(idx);
1442 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
1443
1444 // Move elements to make space for the new value
1445 std::move_backward(
1446 insert_pos,
1447 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
1448 data_buffer_adaptor.end()
1449 );
1450
1451 for (const T& value : values)
1452 {
1453 std::copy(value.begin(), value.end(), insert_pos);
1454 std::advance(insert_pos, value.size());
1455 }
1456
1457 const auto sizes_of_each_value = std::ranges::views::transform(
1458 values,
1459 [](const T& value) -> offset_type
1460 {
1461 return static_cast<offset_type>(value.size());
1462 }
1463 );
1464 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1465 return sparrow::next(value_begin(), idx);
1466 }
1467
1468 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1469 template <mpl::iterator_of_type<OT> InputIt>
1470 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offsets(
1472 InputIt first_sizes,
1473 InputIt last_sizes
1474 ) -> offset_iterator
1475 {
1476 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1477 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1478 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1479 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1480 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1481 const auto idx = std::distance(offsets_cbegin(), pos);
1482 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1483
1484 // Check for offset overflow before adjusting
1485 if (!offset_buffer_adaptor.empty())
1486 {
1487 const offset_type last_offset = offset_buffer_adaptor.back();
1488 check_offset_overflow(last_offset, cumulative_sizes);
1489 }
1490
1491 const auto sizes_count = std::distance(first_sizes, last_sizes);
1492 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1493 // Move the offsets to make space for the new offsets
1494 std::move_backward(
1495 offset_buffer_adaptor.begin() + idx,
1496 offset_buffer_adaptor.end() - sizes_count,
1497 offset_buffer_adaptor.end()
1498 );
1499 // Adjust offsets for subsequent elements
1500 std::for_each(
1501 offset_buffer_adaptor.begin() + idx + sizes_count,
1502 offset_buffer_adaptor.end(),
1503 [cumulative_sizes](auto& offset)
1504 {
1505 offset += cumulative_sizes;
1506 }
1507 );
1508 // Put the right values in the new offsets
1509 InputIt it = first_sizes;
1510 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1511 {
1512 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1513 ++it;
1514 }
1515 return offset(static_cast<size_t>(idx));
1516 }
1517
1518 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1519 constexpr auto
1520 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_values(const_value_iterator pos, size_type count)
1521 -> value_iterator
1522 {
1523 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1524 SPARROW_ASSERT_TRUE(pos <= value_cend());
1525 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1526 if (count == 0)
1527 {
1528 return sparrow::next(value_begin(), index);
1529 }
1530 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1531 const auto offset_begin = *offset(index);
1532 const auto offset_end = *offset(index + count);
1533 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1534 // move the values after the erased ones
1535 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
1536 data_buffer.resize(data_buffer.size() - difference);
1537 // adjust the offsets for the subsequent elements
1538 erase_offsets(offset(index), count);
1539 return sparrow::next(value_begin(), index);
1540 }
1541
1542 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1543 constexpr auto
1544 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_offsets(const_offset_iterator pos, size_type count)
1545 -> offset_iterator
1546 {
1547 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1548 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1549 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1550 if (count == 0)
1551 {
1552 return offset(index);
1553 }
1554 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1555 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1556 const OT offset_start_value = *offset(index);
1557 const OT offset_end_value = *offset(index + count);
1558 const OT difference = offset_end_value - offset_start_value;
1559 // move the offsets after the erased ones
1560 std::move(
1561 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1562 offset_buffer_adaptor.end(),
1563 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1564 );
1565 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1566 // adjust the offsets for the subsequent elements
1567 std::for_each(
1568 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1569 offset_buffer_adaptor.end(),
1570 [difference](OT& offset)
1571 {
1572 offset -= difference;
1573 }
1574 );
1575 return offset(index);
1576 }
1577
1578}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
constexpr array_bitmap_base_impl & operator=(const array_bitmap_base_impl &)
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:131
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
typename storage_type::default_allocator default_allocator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_array_impl(variable_size_binary_array_impl &&rhs) noexcept=default
Move constructor.
variable_size_binary_array_impl(const variable_size_binary_array_impl &rhs)
Copy constructor.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl & operator=(variable_size_binary_array_impl &&rhs) noexcept=default
Move assignment operator.
variable_size_binary_array_impl & operator=(const variable_size_binary_array_impl &rhs)
Copy assignment operator.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
SPARROW_API void increase(const std::string &key)
std::string key()
Definition buffer.hpp:49
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT, Ext > binary_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT, Ext > string_array_impl
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of Arrow data types, usually paired with the raw bytes of the associated value.
Extensions to the C++ standard library.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.