sparrow 2.3.1
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <limits>
20#include <numeric>
21#include <optional>
22#include <ranges>
23#include <stdexcept>
24#include <string>
25#include <vector>
26
41
42namespace sparrow
43{
44 namespace detail
45 {
46 template <class T, class OT>
48
49 template <>
50 struct variable_size_binary_format<std::string, std::int32_t>
51 {
52 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
53 {
54 return "u";
55 }
56 };
57
58 template <>
59 struct variable_size_binary_format<std::string, std::int64_t>
60 {
61 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
62 {
63 return "U";
64 }
65 };
66
67 template <>
68 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
69 {
70 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
71 {
72 return "z";
73 }
74 };
75
76 template <>
77 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
78 {
79 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
80 {
81 return "Z";
82 }
83 };
84 }
85
86
87 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext = empty_extension>
88 class variable_size_binary_array_impl;
89
90 namespace copy_tracker
91 {
92 template <typename T>
94 std::string key()
95 {
96 return "variable_size_binary_array";
97 }
98 }
99
100 template <layout_offset OT, typename Ext = empty_extension>
104 OT,
105 Ext>;
106
107 template <layout_offset OT, typename Ext = empty_extension>
110 arrow_traits<std::vector<byte_t>>::const_reference,
111 OT,
112 Ext>;
113
127
141
155
169
170 namespace detail
171 {
172 template <>
174 {
175 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
176 {
178 }
179 };
180
181 template <>
183 {
184 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
185 {
187 }
188 };
189
190 template <>
192 {
193 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
194 {
196 }
197 };
198
199 template <>
201 {
202 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
203 {
205 }
206 };
207 }
208
212 template <class T>
213 constexpr bool is_string_array_v = std::same_as<T, string_array>;
214
218 template <class T>
219 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
220
224 template <class T>
225 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
226
230 template <class T>
231 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
232
233 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
235 {
237
241 using offset_type = OT;
242
243 using data_value_type = typename T::value_type;
244
245 using offset_iterator = OT*;
246 using const_offset_iterator = const OT*;
247
250
251 using iterator_tag = std::random_access_iterator_tag;
252
254
263
265
274
276
277 // using iterator = layout_iterator<array_type, false>;
278 // using const_iterator = layout_iterator<array_type, true, CR>;
279 };
280
289 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
291 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT, Ext>>,
292 public Ext
293 {
294 private:
295
296 static_assert(
297 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
298 "Only sequences of types with the same size as uint8_t are supported"
299 );
300
301 public:
302
305
307 using inner_value_type = typename inner_types::inner_value_type;
308 using inner_reference = typename inner_types::inner_reference;
309 using inner_const_reference = typename inner_types::inner_const_reference;
310
311 using offset_type = typename inner_types::offset_type;
316
318 using bitmap_reference = typename base_type::bitmap_reference;
321
325
326 using offset_iterator = typename inner_types::offset_iterator;
327 using const_offset_iterator = typename inner_types::const_offset_iterator;
328
332 using data_iterator = typename inner_types::data_iterator;
333
334 using const_data_iterator = typename inner_types::const_data_iterator;
335 using data_value_type = typename inner_types::data_value_type;
336
337 using value_iterator = typename inner_types::value_iterator;
338 using const_value_iterator = typename inner_types::const_value_iterator;
339
357
367
375
382
390
404 template <class... ARGS>
407 : self_type(create_proxy(std::forward<ARGS>(args)...))
408 {
409 }
410
411 using base_type::get_arrow_proxy;
412 using base_type::size;
413
426 [[nodiscard]] constexpr inner_reference value(size_type i);
427
443 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
444
461 template <std::ranges::range SIZES_RANGE>
462 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
463
464 private:
465
486 template <
489 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
490 [[nodiscard]] static arrow_proxy create_proxy(
491 u8_buffer<C>&& data_buffer,
492 offset_buffer_type&& list_offsets,
494 std::optional<std::string_view> name = std::nullopt,
495 std::optional<METADATA_RANGE> metadata = std::nullopt
496 );
497
516 template <
517 std::ranges::input_range R,
519 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
520 requires(
521 std::ranges::input_range<std::ranges::range_value_t<R>>
523 )
524 [[nodiscard]] static arrow_proxy create_proxy(
525 R&& values,
527 std::optional<std::string_view> name = std::nullopt,
528 std::optional<METADATA_RANGE> metadata = std::nullopt
529 );
530
548 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
549 requires(
550 std::ranges::input_range<std::ranges::range_value_t<R>>
551 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
552 )
553 [[nodiscard]] static arrow_proxy create_proxy(
554 R&& values,
555 bool nullable,
556 std::optional<std::string_view> name = std::nullopt,
557 std::optional<METADATA_RANGE> metadata = std::nullopt
558 );
559
574 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
575 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
576 [[nodiscard]] static arrow_proxy create_proxy(
577 R&&,
578 std::optional<std::string_view> name = std::nullopt,
579 std::optional<METADATA_RANGE> metadata = std::nullopt
580 );
581
599 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
600 [[nodiscard]] static arrow_proxy create_proxy_impl(
601 u8_buffer<C>&& data_buffer,
602 offset_buffer_type&& list_offsets,
603 std::optional<validity_bitmap>&&,
604 std::optional<std::string_view> name = std::nullopt,
605 std::optional<METADATA_RANGE> metadata = std::nullopt
606 );
607
608 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
609 static constexpr size_t DATA_BUFFER_INDEX = 2;
610
622 [[nodiscard]] constexpr offset_iterator offset(size_type i);
623
631 [[nodiscard]] constexpr offset_iterator offsets_begin();
632
640 [[nodiscard]] constexpr offset_iterator offsets_end();
641
653 [[nodiscard]] constexpr data_iterator data(size_type i);
654
662 [[nodiscard]] constexpr value_iterator value_begin();
663
671 [[nodiscard]] constexpr value_iterator value_end();
672
680 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
681
689 [[nodiscard]] constexpr const_value_iterator value_cend() const;
690
702 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
703
711 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
712
720 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
721
733 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
734
735 // Modifiers
736
749 template <std::ranges::sized_range U>
750 requires mpl::convertible_ranges<U, T>
751 constexpr void resize_values(size_type new_length, U value);
752
762 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
763
779 template <std::ranges::sized_range U>
780 requires mpl::convertible_ranges<U, T>
781 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
782
795 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
796
813 template <mpl::iterator_of_type<T> InputIt>
814 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
815
836 template <mpl::iterator_of_type<OT> InputIt>
837 constexpr offset_iterator
838 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
839
857 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
858
875 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
876
892 template <std::ranges::sized_range U>
893 requires mpl::convertible_ranges<U, T>
894 constexpr void assign(U&& rhs, size_type index);
895
896 [[nodiscard]] constexpr std::ptrdiff_t data_buffer_distance(offset_type byte_offset) const;
897
907 constexpr void check_offset_overflow(offset_type current_offset, offset_type size_to_add) const;
908
911 friend base_type;
914 };
915
916 /*********************************************
917 * variable_size_binary_array_impl implementation *
918 *********************************************/
919
920 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
922 : base_type(std::move(proxy))
923 {
924 const auto type = this->get_arrow_proxy().data_type();
927 || type == data_type::LARGE_BINARY
928 );
930 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
931 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
932 && std::same_as<OT, int64_t>) )
933 );
934 }
935
936 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
944
945 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
953
954 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
955 template <std::ranges::range SIZES_RANGE>
958 {
960 std::forward<SIZES_RANGE>(sizes)
961 );
962 }
963
964 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
965 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
966 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
967 u8_buffer<C>&& data_buffer,
968 offset_buffer_type&& offsets,
969 VB&& validity_input,
970 std::optional<std::string_view> name,
971 std::optional<METADATA_RANGE> metadata
972 )
973 {
974 const auto size = offsets.size() - 1;
975 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
976 const auto null_count = vbitmap.null_count();
977
980 std::move(name), // name
981 std::move(metadata), // metadata
982 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
983 nullptr, // children
984 repeat_view<bool>(true, 0),
985 nullptr, // dictionary
986 true
987
988 );
989 std::vector<buffer<std::uint8_t>> arr_buffs;
990 arr_buffs.reserve(3);
991 arr_buffs.emplace_back(std::move(vbitmap).extract_storage());
992 arr_buffs.emplace_back(std::move(offsets).extract_storage());
993 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
994
996 static_cast<std::int64_t>(size), // length
997 static_cast<int64_t>(null_count),
998 0, // offset
999 std::move(arr_buffs),
1000 nullptr, // children
1001 repeat_view<bool>(true, 0),
1002 nullptr, // dictionary
1003 true
1004 );
1005 return arrow_proxy{std::move(arr), std::move(schema)};
1006 }
1007
1008 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1009 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
1010 requires(
1011 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1013 // range of char-like
1014 )
1015 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1016 R&& values,
1017 VB&& validity_input,
1018 std::optional<std::string_view> name,
1019 std::optional<METADATA_RANGE> metadata
1020 )
1021 {
1022 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1023
1024 auto size_range = values
1025 | std::views::transform(
1026 [](const auto& v)
1027 {
1028 return std::ranges::size(v);
1029 }
1030 );
1031 auto offset_buffer = offset_from_sizes(size_range);
1032 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
1033 return create_proxy(
1034 std::move(data_buffer),
1035 std::move(offset_buffer),
1036 std::forward<VB>(validity_input),
1037 std::forward<std::optional<std::string_view>>(name),
1038 std::forward<std::optional<METADATA_RANGE>>(metadata)
1039 );
1040 }
1041
1042 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1043 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
1044 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
1045 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1046 R&& range,
1047 std::optional<std::string_view> name,
1048 std::optional<METADATA_RANGE> metadata
1049 )
1050 {
1051 // split into values and is_non_null ranges
1052 const auto values = range | std::views::transform(nullable_get);
1053 const auto is_non_null = range
1054 | std::views::transform(
1055 [](const auto& v)
1056 {
1057 return v.has_value();
1058 }
1059 );
1060 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
1061 }
1062
1063 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1064 template <
1065 std::ranges::input_range R,
1066 input_metadata_container METADATA_RANGE>
1067 requires(
1068 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1070 // range of
1071 // char-like
1072 )
1073 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1074 R&& values,
1075 bool nullable,
1076 std::optional<std::string_view> name,
1077 std::optional<METADATA_RANGE> metadata
1078 )
1079 {
1080 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1081 const size_t size = std::ranges::size(values);
1082 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1083 auto size_range = values
1084 | std::views::transform(
1085 [](const auto& v)
1086 {
1087 return std::ranges::size(v);
1088 }
1089 );
1090 auto offset_buffer = offset_from_sizes(size_range);
1091 return create_proxy_impl(
1092 std::move(data_buffer),
1093 std::move(offset_buffer),
1094 nullable ? std::make_optional<validity_bitmap>(nullptr, size, validity_bitmap::default_allocator())
1095 : std::nullopt,
1096 std::move(name),
1097 std::move(metadata)
1098 );
1099 }
1100
1101 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1102 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1103 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy_impl(
1104 u8_buffer<C>&& data_buffer,
1105 offset_buffer_type&& list_offsets,
1106 std::optional<validity_bitmap>&& bitmap,
1107 std::optional<std::string_view> name,
1108 std::optional<METADATA_RANGE> metadata
1109 )
1110 {
1111 const auto size = list_offsets.size() - 1;
1112 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1113
1114 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1115 flags = bitmap.has_value()
1116 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1117 : std::nullopt;
1118
1119 ArrowSchema schema = make_arrow_schema(
1121 std::move(name), // name
1122 std::move(metadata), // metadata
1123 flags, // flags,
1124 nullptr, // children
1125 repeat_view<bool>(true, 0),
1126 nullptr, // dictionary
1127 true
1128
1129 );
1130 std::vector<buffer<std::uint8_t>> arr_buffs;
1131 arr_buffs.reserve(3);
1132 arr_buffs.emplace_back(
1133 bitmap.has_value() ? std::move(*bitmap).extract_storage()
1134 : buffer<std::uint8_t>{nullptr, 0, buffer<std::uint8_t>::default_allocator()}
1135 );
1136 arr_buffs.emplace_back(std::move(list_offsets).extract_storage());
1137 arr_buffs.emplace_back(std::move(data_buffer).extract_storage());
1138
1139 ArrowArray arr = make_arrow_array(
1140 static_cast<std::int64_t>(size), // length
1141 static_cast<int64_t>(null_count),
1142 0, // offset
1143 std::move(arr_buffs),
1144 nullptr, // children
1145 repeat_view<bool>(true, 0),
1146 nullptr, // dictionary
1147 true
1148 );
1149 arrow_proxy proxy{std::move(arr), std::move(schema)};
1150 Ext::init(proxy);
1151 return proxy;
1152 }
1153
1154 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1155 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) -> data_iterator
1156 {
1157 arrow_proxy& proxy = get_arrow_proxy();
1158 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1159 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1160 }
1161
1162 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1163 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) const
1164 -> const_data_iterator
1165 {
1166 const arrow_proxy& proxy = this->get_arrow_proxy();
1167 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1168 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1169 }
1170
1171 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1172 template <std::ranges::sized_range U>
1174 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::assign(U&& rhs, size_type index)
1175 {
1176 SPARROW_ASSERT_TRUE(index < size());
1177 const auto offset_beg = *offset(index);
1178 const auto offset_end = *offset(index + 1);
1179 const auto initial_value_length = offset_end - offset_beg;
1180 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1181 const OT shift_byte_count = new_value_length - initial_value_length;
1182 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1183 if (shift_byte_count != 0)
1184 {
1185 // Check for offset overflow before adjusting
1186 if (shift_byte_count > 0)
1187 {
1188 const offset_type last_offset = *offset(size());
1189 check_offset_overflow(last_offset, shift_byte_count);
1190 }
1191
1192 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1193 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1194 : data_buffer.size() + shift_val_abs;
1195
1196 if (shift_byte_count > 0)
1197 {
1198 data_buffer.resize(new_data_buffer_size);
1199 auto* data_begin = data_buffer.template data<data_value_type>();
1200 auto* data_end = data_begin + static_cast<std::ptrdiff_t>(data_buffer.size());
1201 // Move elements to make space for the new value
1202 std::move_backward(
1203 data_begin + data_buffer_distance(offset_end),
1204 data_end - data_buffer_distance(shift_byte_count),
1205 data_end
1206 );
1207 }
1208 else
1209 {
1210 auto* data_begin = data_buffer.template data<data_value_type>();
1211 auto* data_end = data_begin + static_cast<std::ptrdiff_t>(data_buffer.size());
1212 std::move(
1213 data_begin + data_buffer_distance(offset_end),
1214 data_end,
1215 data_begin + data_buffer_distance(offset_end + shift_byte_count)
1216 );
1217 data_buffer.resize(new_data_buffer_size);
1218 }
1219 // Adjust offsets for subsequent elements
1220 std::for_each(
1221 offset(index + 1),
1222 offset(size() + 1),
1223 [shift_byte_count](auto& offset)
1224 {
1225 offset += shift_byte_count;
1226 }
1227 );
1228 }
1229 auto tmp = std::views::transform(
1230 rhs,
1231 [](const auto& val)
1232 {
1233 return static_cast<data_value_type>(val);
1234 }
1235 );
1236 auto* data_begin = data_buffer.template data<data_value_type>();
1237 // Copy the new value into the buffer
1238 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), data_begin + data_buffer_distance(offset_beg));
1239 }
1240
1241 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1242 constexpr auto
1243 variable_size_binary_array_impl<T, CR, OT, Ext>::data_buffer_distance(offset_type byte_offset) const
1244 -> std::ptrdiff_t
1245 {
1246 using promoted_type = std::common_type_t<offset_type, std::ptrdiff_t>;
1248 static_cast<promoted_type>(byte_offset)
1249 >= static_cast<promoted_type>(std::numeric_limits<std::ptrdiff_t>::min())
1250 );
1252 static_cast<promoted_type>(byte_offset)
1253 <= static_cast<promoted_type>(std::numeric_limits<std::ptrdiff_t>::max())
1254 );
1255 return static_cast<std::ptrdiff_t>(byte_offset);
1256 }
1257
1258 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1259 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::check_offset_overflow(
1260 offset_type current_offset,
1261 offset_type size_to_add
1262 ) const
1263 {
1264 constexpr offset_type max_offset = std::numeric_limits<offset_type>::max();
1265 if (current_offset > max_offset - size_to_add)
1266 {
1267 throw std::overflow_error("Offset overflow: adding elements would exceed maximum offset value");
1268 }
1269 }
1270
1271 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1272 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) -> offset_iterator
1273 {
1274 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1275 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1276 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1277 }
1278
1279 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1280 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) const
1281 -> const_offset_iterator
1282 {
1283 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1284 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1285 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1286 }
1287
1288 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1289 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_begin() -> offset_iterator
1290 {
1291 return offset(0);
1292 }
1293
1294 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1295 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cbegin() const
1296 -> const_offset_iterator
1297 {
1298 return offset(0);
1299 }
1300
1301 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1302 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_end() -> offset_iterator
1303 {
1304 return offset(size() + 1);
1305 }
1306
1307 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1308 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cend() const
1309 -> const_offset_iterator
1310 {
1311 return offset(size() + 1);
1312 }
1313
1314 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1316 {
1317 SPARROW_ASSERT_TRUE(i < size());
1318 return inner_reference(this, i);
1319 }
1320
1321 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1324 {
1325 SPARROW_ASSERT_TRUE(i < this->size());
1326 const OT offset_begin = *offset(i);
1327 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1328 const OT offset_end = *offset(i + 1);
1329 SPARROW_ASSERT_TRUE(offset_end >= 0);
1330 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1331 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1332 return inner_const_reference(pointer_begin, pointer_end);
1333 }
1334
1335 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1336 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_begin() -> value_iterator
1337 {
1338 return value_iterator{this, 0};
1339 }
1340
1341 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1342 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_end() -> value_iterator
1343 {
1344 return sparrow::next(value_begin(), size());
1345 }
1346
1347 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1348 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cbegin() const -> const_value_iterator
1349 {
1350 return const_value_iterator{this, 0};
1351 }
1352
1353 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1354 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cend() const -> const_value_iterator
1355 {
1356 return sparrow::next(value_cbegin(), this->size());
1357 }
1358
1359 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1360 template <std::ranges::sized_range U>
1362 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::resize_values(size_type new_length, U value)
1363 {
1364 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1365 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1366 if (new_length < size())
1367 {
1368 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1369 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1370 data_buffer.resize(offset_begin);
1371 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1372 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1373 offset_buffer_adaptor.resize(new_size + 1);
1374 }
1375 else if (new_length > size())
1376 {
1377 insert_value(value_cend(), value, new_length - size());
1378 }
1379 }
1380
1381 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1382 template <std::ranges::sized_range U>
1384 constexpr auto
1385 variable_size_binary_array_impl<T, CR, OT, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1387 {
1388 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1389 const OT offset_begin = *offset(idx);
1390 auto casted_value = std::ranges::views::transform(
1391 value,
1392 [](const auto& val)
1393 {
1394 return static_cast<std::uint8_t>(val);
1395 }
1396 );
1397 const auto my_repeat_view = repeat_view{casted_value, count};
1398 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1399 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1400 const auto* data_begin = data_buffer.template data<std::uint8_t>();
1401 using data_buffer_type = std::remove_reference_t<decltype(data_buffer)>;
1402 const auto pos_to_insert = typename data_buffer_type::const_iterator(
1403 data_begin + data_buffer_distance(offset_begin)
1404 );
1405 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1406 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1407 return sparrow::next(value_begin(), idx);
1408 }
1409
1410 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1411 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offset(
1412 const_offset_iterator pos,
1413 offset_type value_size,
1414 size_type count
1415 ) -> offset_iterator
1416 {
1417 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1418 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1419 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1420 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1421
1422 // Check for offset overflow before adjusting
1423 if (!offset_buffer_adaptor.empty())
1424 {
1425 const offset_type last_offset = offset_buffer_adaptor.back();
1426 check_offset_overflow(last_offset, cumulative_size);
1427 }
1428
1429 // Adjust offsets for subsequent elements
1430 std::for_each(
1431 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1432 offset_buffer_adaptor.end(),
1433 [cumulative_size](auto& offset)
1434 {
1435 offset += cumulative_size;
1436 }
1437 );
1438 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1439 // Put the right values in the new offsets
1440 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1441 {
1442 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1443 }
1444 return offsets_begin() + idx;
1445 }
1446
1447 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1448 template <mpl::iterator_of_type<T> InputIt>
1449 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_values(
1451 InputIt first,
1452 InputIt last
1453 ) -> value_iterator
1454 {
1455 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1456 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1457 auto values = std::ranges::subrange(first, last);
1458 const size_t cumulative_sizes = std::accumulate(
1459 values.begin(),
1460 values.end(),
1461 size_t(0),
1462 [](size_t acc, const T& value)
1463 {
1464 return acc + value.size();
1465 }
1466 );
1467 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1468 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1469 const OT offset_begin = *offset(idx);
1470 auto* data_begin = data_buffer_adaptor.data();
1471 auto* data_end = data_begin + static_cast<std::ptrdiff_t>(data_buffer_adaptor.size());
1472 auto* insert_pos = data_begin + data_buffer_distance(offset_begin);
1473
1474 // Move elements to make space for the new value
1475 std::move_backward(insert_pos, data_end - static_cast<std::ptrdiff_t>(cumulative_sizes), data_end);
1476
1477 for (const T& value : values)
1478 {
1479 std::copy(value.begin(), value.end(), insert_pos);
1480 std::advance(insert_pos, static_cast<std::ptrdiff_t>(value.size()));
1481 }
1482
1483 const auto sizes_of_each_value = std::ranges::views::transform(
1484 values,
1485 [](const T& value) -> offset_type
1486 {
1487 return static_cast<offset_type>(value.size());
1488 }
1489 );
1490 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1491 return sparrow::next(value_begin(), idx);
1492 }
1493
1494 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1495 template <mpl::iterator_of_type<OT> InputIt>
1496 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offsets(
1498 InputIt first_sizes,
1499 InputIt last_sizes
1500 ) -> offset_iterator
1501 {
1502 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1503 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1504 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1505 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1506 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1507 const auto idx = std::distance(offsets_cbegin(), pos);
1508 // GCC 13 false-positive: transform_view iterators yield scalar OT values, never null
1509#ifdef __GNUC__
1510# pragma GCC diagnostic push
1511# pragma GCC diagnostic ignored "-Wnull-dereference"
1512#endif
1513 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1514#ifdef __GNUC__
1515# pragma GCC diagnostic pop
1516#endif
1517
1518 // Check for offset overflow before adjusting
1519 if (!offset_buffer_adaptor.empty())
1520 {
1521 const offset_type last_offset = offset_buffer_adaptor.back();
1522 check_offset_overflow(last_offset, cumulative_sizes);
1523 }
1524
1525 const auto sizes_count = std::distance(first_sizes, last_sizes);
1526 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1527 // Move the offsets to make space for the new offsets
1528 std::move_backward(
1529 offset_buffer_adaptor.begin() + idx,
1530 offset_buffer_adaptor.end() - sizes_count,
1531 offset_buffer_adaptor.end()
1532 );
1533 // Adjust offsets for subsequent elements
1534 std::for_each(
1535 offset_buffer_adaptor.begin() + idx + sizes_count,
1536 offset_buffer_adaptor.end(),
1537 [cumulative_sizes](auto& offset)
1538 {
1539 offset += cumulative_sizes;
1540 }
1541 );
1542 // Put the right values in the new offsets
1543 InputIt it = first_sizes;
1544 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1545 {
1546 // GCC 13 false-positive: iterator is bounded and valid within the loop
1547#ifdef __GNUC__
1548# pragma GCC diagnostic push
1549# pragma GCC diagnostic ignored "-Wnull-dereference"
1550#endif
1551 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1552#ifdef __GNUC__
1553# pragma GCC diagnostic pop
1554#endif
1555 ++it;
1556 }
1557 return offset(static_cast<size_t>(idx));
1558 }
1559
1560 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1561 constexpr auto
1562 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_values(const_value_iterator pos, size_type count)
1563 -> value_iterator
1564 {
1565 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1566 SPARROW_ASSERT_TRUE(pos <= value_cend());
1567 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1568 if (count == 0)
1569 {
1570 return sparrow::next(value_begin(), index);
1571 }
1572 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1573 const auto offset_begin = *offset(index);
1574 const auto offset_end = *offset(index + count);
1575 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1576 auto* data_begin = data_buffer.template data<data_value_type>();
1577 auto* data_end = data_begin + static_cast<std::ptrdiff_t>(data_buffer.size());
1578 // move the values after the erased ones
1579 std::move(
1580 data_begin + data_buffer_distance(offset_end),
1581 data_end,
1582 data_begin + data_buffer_distance(offset_begin)
1583 );
1584 data_buffer.resize(data_buffer.size() - difference);
1585 // adjust the offsets for the subsequent elements
1586 erase_offsets(offset(index), count);
1587 return sparrow::next(value_begin(), index);
1588 }
1589
1590 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1591 constexpr auto
1592 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_offsets(const_offset_iterator pos, size_type count)
1593 -> offset_iterator
1594 {
1595 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1596 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1597 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1598 if (count == 0)
1599 {
1600 return offset(index);
1601 }
1602 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1603 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1604 const OT offset_start_value = *offset(index);
1605 const OT offset_end_value = *offset(index + count);
1606 const OT difference = offset_end_value - offset_start_value;
1607 // move the offsets after the erased ones
1608 std::move(
1609 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1610 offset_buffer_adaptor.end(),
1611 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1612 );
1613 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1614 // adjust the offsets for the subsequent elements
1615 std::for_each(
1616 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1617 offset_buffer_adaptor.end(),
1618 [difference](OT& offset)
1619 {
1620 offset -= difference;
1621 }
1622 );
1623 return offset(index);
1624 }
1625
1626}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
constexpr array_bitmap_base_impl & operator=(const array_bitmap_base_impl &)
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:131
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
typename storage_type::default_allocator default_allocator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_array_impl(variable_size_binary_array_impl &&rhs) noexcept=default
Move constructor.
variable_size_binary_array_impl(const variable_size_binary_array_impl &rhs)
Copy constructor.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl & operator=(variable_size_binary_array_impl &&rhs) noexcept=default
Move assignment operator.
variable_size_binary_array_impl & operator=(const variable_size_binary_array_impl &rhs)
Copy assignment operator.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
SPARROW_API void increase(const std::string &key)
std::string key()
Definition buffer.hpp:49
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT, Ext > binary_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:605
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
constexpr nullable_get_fn nullable_get
Definition nullable.hpp:102
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT, Ext > string_array_impl
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
repeat_view(T &, size_t) -> repeat_view< T & >
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
Extensions to the C++ standard library.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.