sparrow 1.3.0
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <limits>
20#include <numeric>
21#include <optional>
22#include <ranges>
23#include <stdexcept>
24#include <string>
25#include <vector>
26
39
40namespace sparrow
41{
42 namespace detail
43 {
44 template <class T, class OT>
46
47 template <>
48 struct variable_size_binary_format<std::string, std::int32_t>
49 {
50 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
51 {
52 return "u";
53 }
54 };
55
56 template <>
57 struct variable_size_binary_format<std::string, std::int64_t>
58 {
59 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
60 {
61 return "U";
62 }
63 };
64
65 template <>
66 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
67 {
68 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
69 {
70 return "z";
71 }
72 };
73
74 template <>
75 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
76 {
77 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
78 {
79 return "Z";
80 }
81 };
82 }
83
84
// Forward declaration of the array implementation template.
// T is a sized range, OT must satisfy layout_offset, and Ext defaults to empty_extension.
85 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext = empty_extension>
86 class variable_size_binary_array_impl;
87
88 template <layout_offset OT, typename Ext = empty_extension>
92 OT,
93 Ext>;
94
95 template <layout_offset OT, typename Ext = empty_extension>
98 arrow_traits<std::vector<byte_t>>::const_reference,
99 OT,
100 Ext>;
101
115
129
143
157
158 namespace detail
159 {
160 template <>
162 {
163 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
164 {
166 }
167 };
168
169 template <>
171 {
172 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
173 {
175 }
176 };
177
178 template <>
180 {
181 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
182 {
184 }
185 };
186
187 template <>
189 {
190 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
191 {
193 }
194 };
195 }
196
200 template <class T>
201 constexpr bool is_string_array_v = std::same_as<T, string_array>;
202
206 template <class T>
207 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
208
212 template <class T>
213 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
214
218 template <class T>
219 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
220
221 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
223 {
225
229 using offset_type = OT;
230
231 using data_value_type = typename T::value_type;
232
233 using offset_iterator = OT*;
234 using const_offset_iterator = const OT*;
235
238
239 using iterator_tag = std::random_access_iterator_tag;
240
242
251
253
262
264
265 // using iterator = layout_iterator<array_type, false>;
266 // using const_iterator = layout_iterator<array_type, true, CR>;
267 };
268
277 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
279 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT, Ext>>,
280 public Ext
281 {
282 private:
283
284 static_assert(
285 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
286 "Only sequences of types with the same size as uint8_t are supported"
287 );
288
289 public:
290
293
295 using inner_value_type = typename inner_types::inner_value_type;
296 using inner_reference = typename inner_types::inner_reference;
297 using inner_const_reference = typename inner_types::inner_const_reference;
298
299 using offset_type = typename inner_types::offset_type;
304
306 using bitmap_reference = typename base_type::bitmap_reference;
309
313
314 using offset_iterator = typename inner_types::offset_iterator;
315 using const_offset_iterator = typename inner_types::const_offset_iterator;
316
320 using data_iterator = typename inner_types::data_iterator;
321
322 using const_data_iterator = typename inner_types::const_data_iterator;
323 using data_value_type = typename inner_types::data_value_type;
324
325 using value_iterator = typename inner_types::value_iterator;
326 using const_value_iterator = typename inner_types::const_value_iterator;
327
345
359 template <class... ARGS>
362 : self_type(create_proxy(std::forward<ARGS>(args)...))
363 {
364 }
365
366 using base_type::get_arrow_proxy;
367 using base_type::size;
368
381 [[nodiscard]] constexpr inner_reference value(size_type i);
382
398 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
399
416 template <std::ranges::range SIZES_RANGE>
417 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
418
419 private:
420
441 template <
444 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
445 [[nodiscard]] static arrow_proxy create_proxy(
446 u8_buffer<C>&& data_buffer,
447 offset_buffer_type&& list_offsets,
448 VB&& validity_input = validity_bitmap{},
449 std::optional<std::string_view> name = std::nullopt,
450 std::optional<METADATA_RANGE> metadata = std::nullopt
451 );
452
471 template <
472 std::ranges::input_range R,
474 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
475 requires(
476 std::ranges::input_range<std::ranges::range_value_t<R>>
478 )
479 [[nodiscard]] static arrow_proxy create_proxy(
480 R&& values,
481 VB&& validity_input = validity_bitmap{},
482 std::optional<std::string_view> name = std::nullopt,
483 std::optional<METADATA_RANGE> metadata = std::nullopt
484 );
485
503 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
504 requires(
505 std::ranges::input_range<std::ranges::range_value_t<R>>
506 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
507 )
508 [[nodiscard]] static arrow_proxy create_proxy(
509 R&& values,
510 bool nullable,
511 std::optional<std::string_view> name = std::nullopt,
512 std::optional<METADATA_RANGE> metadata = std::nullopt
513 );
514
529 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
530 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
531 [[nodiscard]] static arrow_proxy create_proxy(
532 R&&,
533 std::optional<std::string_view> name = std::nullopt,
534 std::optional<METADATA_RANGE> metadata = std::nullopt
535 );
536
554 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
555 [[nodiscard]] static arrow_proxy create_proxy_impl(
556 u8_buffer<C>&& data_buffer,
557 offset_buffer_type&& list_offsets,
558 std::optional<validity_bitmap>&&,
559 std::optional<std::string_view> name = std::nullopt,
560 std::optional<METADATA_RANGE> metadata = std::nullopt
561 );
562
563 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
564 static constexpr size_t DATA_BUFFER_INDEX = 2;
565
577 [[nodiscard]] constexpr offset_iterator offset(size_type i);
578
586 [[nodiscard]] constexpr offset_iterator offsets_begin();
587
595 [[nodiscard]] constexpr offset_iterator offsets_end();
596
608 [[nodiscard]] constexpr data_iterator data(size_type i);
609
617 [[nodiscard]] constexpr value_iterator value_begin();
618
626 [[nodiscard]] constexpr value_iterator value_end();
627
635 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
636
644 [[nodiscard]] constexpr const_value_iterator value_cend() const;
645
657 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
658
666 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
667
675 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
676
688 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
689
690 // Modifiers
691
704 template <std::ranges::sized_range U>
705 requires mpl::convertible_ranges<U, T>
706 constexpr void resize_values(size_type new_length, U value);
707
717 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
718
734 template <std::ranges::sized_range U>
735 requires mpl::convertible_ranges<U, T>
736 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
737
750 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
751
768 template <mpl::iterator_of_type<T> InputIt>
769 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
770
791 template <mpl::iterator_of_type<OT> InputIt>
792 constexpr offset_iterator
793 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
794
812 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
813
830 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
831
847 template <std::ranges::sized_range U>
848 requires mpl::convertible_ranges<U, T>
849 constexpr void assign(U&& rhs, size_type index);
850
860 constexpr void check_offset_overflow(offset_type current_offset, offset_type size_to_add) const;
861
864 friend base_type;
867 };
868
869 /*********************************************
870 * variable_size_binary_array_impl implementation *
871 *********************************************/
872
873 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
875 : base_type(std::move(proxy))
876 {
877 const auto type = this->get_arrow_proxy().data_type();
880 || type == data_type::LARGE_BINARY
881 );
883 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
884 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
885 && std::same_as<OT, int64_t>) )
886 );
887 }
888
889 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
890 template <std::ranges::range SIZES_RANGE>
893 {
895 std::forward<SIZES_RANGE>(sizes)
896 );
897 }
898
899 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
900 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
901 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
902 u8_buffer<C>&& data_buffer,
903 offset_buffer_type&& offsets,
904 VB&& validity_input,
905 std::optional<std::string_view> name,
906 std::optional<METADATA_RANGE> metadata
907 )
908 {
909 const auto size = offsets.size() - 1;
910 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
911 const auto null_count = vbitmap.null_count();
912
915 std::move(name), // name
916 std::move(metadata), // metadata
917 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
918 nullptr, // children
919 repeat_view<bool>(true, 0),
920 nullptr, // dictionary
921 true
922
923 );
924 std::vector<buffer<std::uint8_t>> arr_buffs = {
925 std::move(vbitmap).extract_storage(),
926 std::move(offsets).extract_storage(),
927 std::move(data_buffer).extract_storage()
928 };
929
930 ArrowArray arr = make_arrow_array(
931 static_cast<std::int64_t>(size), // length
932 static_cast<int64_t>(null_count),
933 0, // offset
934 std::move(arr_buffs),
935 nullptr, // children
936 repeat_view<bool>(true, 0),
937 nullptr, // dictionary
938 true
939 );
940 return arrow_proxy{std::move(arr), std::move(schema)};
941 }
942
943 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
944 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
945 requires(
946 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
948 // range of char-like
949 )
950 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
951 R&& values,
952 VB&& validity_input,
953 std::optional<std::string_view> name,
954 std::optional<METADATA_RANGE> metadata
955 )
956 {
957 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
958
959 auto size_range = values
960 | std::views::transform(
961 [](const auto& v)
962 {
963 return std::ranges::size(v);
964 }
965 );
966 auto offset_buffer = offset_from_sizes(size_range);
967 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
968 return create_proxy(
969 std::move(data_buffer),
970 std::move(offset_buffer),
971 std::forward<VB>(validity_input),
972 std::forward<std::optional<std::string_view>>(name),
973 std::forward<std::optional<METADATA_RANGE>>(metadata)
974 );
975 }
976
977 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
978 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
979 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
980 arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
981 R&& range,
982 std::optional<std::string_view> name,
983 std::optional<METADATA_RANGE> metadata
984 )
985 {
986 // split into values and is_non_null ranges
987 const auto values = range
988 | std::views::transform(
989 [](const auto& v)
990 {
991 return v.get();
992 }
993 );
994 const auto is_non_null = range
995 | std::views::transform(
996 [](const auto& v)
997 {
998 return v.has_value();
999 }
1000 );
1001 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
1002 }
1003
1004 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1005 template <
1006 std::ranges::input_range R,
1007 input_metadata_container METADATA_RANGE>
1008 requires(
1009 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
1011 // range of
1012 // char-like
1013 )
1014 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy(
1015 R&& values,
1016 bool nullable,
1017 std::optional<std::string_view> name,
1018 std::optional<METADATA_RANGE> metadata
1019 )
1020 {
1021 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1022 const size_t size = std::ranges::size(values);
1023 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1024 auto size_range = values
1025 | std::views::transform(
1026 [](const auto& v)
1027 {
1028 return std::ranges::size(v);
1029 }
1030 );
1031 auto offset_buffer = offset_from_sizes(size_range);
1032 return create_proxy_impl(
1033 std::move(data_buffer),
1034 std::move(offset_buffer),
1035 nullable ? std::make_optional<validity_bitmap>(nullptr, size) : std::nullopt,
1036 std::move(name),
1037 std::move(metadata)
1038 );
1039 }
1040
1041 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1042 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1043 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT, Ext>::create_proxy_impl(
1044 u8_buffer<C>&& data_buffer,
1045 offset_buffer_type&& list_offsets,
1046 std::optional<validity_bitmap>&& bitmap,
1047 std::optional<std::string_view> name,
1048 std::optional<METADATA_RANGE> metadata
1049 )
1050 {
1051 const auto size = list_offsets.size() - 1;
1052 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1053
1054 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1055 flags = bitmap.has_value()
1056 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1057 : std::nullopt;
1058
1059 ArrowSchema schema = make_arrow_schema(
1061 std::move(name), // name
1062 std::move(metadata), // metadata
1063 flags, // flags,
1064 nullptr, // children
1065 repeat_view<bool>(true, 0),
1066 nullptr, // dictionary
1067 true
1068
1069 );
1070 std::vector<buffer<std::uint8_t>> arr_buffs = {
1071 bitmap.has_value() ? std::move(*bitmap).extract_storage() : buffer<std::uint8_t>{nullptr, 0},
1072 std::move(list_offsets).extract_storage(),
1073 std::move(data_buffer).extract_storage()
1074 };
1075
1076 ArrowArray arr = make_arrow_array(
1077 static_cast<std::int64_t>(size), // length
1078 static_cast<int64_t>(null_count),
1079 0, // offset
1080 std::move(arr_buffs),
1081 nullptr, // children
1082 repeat_view<bool>(true, 0),
1083 nullptr, // dictionary
1084 true
1085 );
1086 arrow_proxy proxy{std::move(arr), std::move(schema)};
1087 Ext::init(proxy);
1088 return proxy;
1089 }
1090
1091 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1092 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) -> data_iterator
1093 {
1094 arrow_proxy& proxy = get_arrow_proxy();
1095 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1096 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1097 }
1098
1099 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1100 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::data(size_type i) const
1101 -> const_data_iterator
1102 {
1103 const arrow_proxy& proxy = this->get_arrow_proxy();
1104 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1105 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1106 }
1107
1108 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1109 template <std::ranges::sized_range U>
1111 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::assign(U&& rhs, size_type index)
1112 {
1113 SPARROW_ASSERT_TRUE(index < size());
1114 const auto offset_beg = *offset(index);
1115 const auto offset_end = *offset(index + 1);
1116 const auto initial_value_length = offset_end - offset_beg;
1117 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1118 const OT shift_byte_count = new_value_length - initial_value_length;
1119 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1120 if (shift_byte_count != 0)
1121 {
1122 // Check for offset overflow before adjusting
1123 if (shift_byte_count > 0)
1124 {
1125 const offset_type last_offset = *offset(size());
1126 check_offset_overflow(last_offset, shift_byte_count);
1127 }
1128
1129 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1130 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1131 : data_buffer.size() + shift_val_abs;
1132
1133 if (shift_byte_count > 0)
1134 {
1135 data_buffer.resize(new_data_buffer_size);
1136 // Move elements to make space for the new value
1137 std::move_backward(
1138 sparrow::next(data_buffer.begin(), offset_end),
1139 sparrow::next(data_buffer.end(), -shift_byte_count),
1140 data_buffer.end()
1141 );
1142 }
1143 else
1144 {
1145 std::move(
1146 sparrow::next(data_buffer.begin(), offset_end),
1147 data_buffer.end(),
1148 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
1149 );
1150 data_buffer.resize(new_data_buffer_size);
1151 }
1152 // Adjust offsets for subsequent elements
1153 std::for_each(
1154 offset(index + 1),
1155 offset(size() + 1),
1156 [shift_byte_count](auto& offset)
1157 {
1158 offset += shift_byte_count;
1159 }
1160 );
1161 }
1162 auto tmp = std::views::transform(
1163 rhs,
1164 [](const auto& val)
1165 {
1166 return static_cast<std::uint8_t>(val);
1167 }
1168 );
1169 // Copy the new value into the buffer
1170 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
1171 }
1172
1173 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1174 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::check_offset_overflow(
1175 offset_type current_offset,
1176 offset_type size_to_add
1177 ) const
1178 {
1179 constexpr offset_type max_offset = std::numeric_limits<offset_type>::max();
1180 if (current_offset > max_offset - size_to_add)
1181 {
1182 throw std::overflow_error("Offset overflow: adding elements would exceed maximum offset value");
1183 }
1184 }
1185
1186 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1187 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) -> offset_iterator
1188 {
1189 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1190 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1191 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1192 }
1193
1194 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1195 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offset(size_type i) const
1196 -> const_offset_iterator
1197 {
1198 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1199 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1200 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1201 }
1202
1203 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1204 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_begin() -> offset_iterator
1205 {
1206 return offset(0);
1207 }
1208
1209 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1210 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cbegin() const
1211 -> const_offset_iterator
1212 {
1213 return offset(0);
1214 }
1215
1216 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1217 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_end() -> offset_iterator
1218 {
1219 return offset(size() + 1);
1220 }
1221
1222 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1223 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::offsets_cend() const
1224 -> const_offset_iterator
1225 {
1226 return offset(size() + 1);
1227 }
1228
1229 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1231 {
1232 SPARROW_ASSERT_TRUE(i < size());
1233 return inner_reference(this, i);
1234 }
1235
1236 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1239 {
1240 SPARROW_ASSERT_TRUE(i < this->size());
1241 const OT offset_begin = *offset(i);
1242 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1243 const OT offset_end = *offset(i + 1);
1244 SPARROW_ASSERT_TRUE(offset_end >= 0);
1245 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1246 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1247 return inner_const_reference(pointer_begin, pointer_end);
1248 }
1249
1250 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1251 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_begin() -> value_iterator
1252 {
1253 return value_iterator{this, 0};
1254 }
1255
1256 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1257 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_end() -> value_iterator
1258 {
1259 return sparrow::next(value_begin(), size());
1260 }
1261
1262 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1263 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cbegin() const -> const_value_iterator
1264 {
1265 return const_value_iterator{this, 0};
1266 }
1267
1268 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1269 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::value_cend() const -> const_value_iterator
1270 {
1271 return sparrow::next(value_cbegin(), this->size());
1272 }
1273
1274 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1275 template <std::ranges::sized_range U>
1277 constexpr void variable_size_binary_array_impl<T, CR, OT, Ext>::resize_values(size_type new_length, U value)
1278 {
1279 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1280 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1281 if (new_length < size())
1282 {
1283 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1284 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1285 data_buffer.resize(offset_begin);
1286 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1287 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1288 offset_buffer_adaptor.resize(new_size + 1);
1289 }
1290 else if (new_length > size())
1291 {
1292 insert_value(value_cend(), value, new_length - size());
1293 }
1294 }
1295
1296 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1297 template <std::ranges::sized_range U>
1299 constexpr auto
1300 variable_size_binary_array_impl<T, CR, OT, Ext>::insert_value(const_value_iterator pos, U value, size_type count)
1302 {
1303 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1304 const OT offset_begin = *offset(idx);
1305 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
1306 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
1307 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1308 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1309 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
1310 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1311 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1312 return sparrow::next(value_begin(), idx);
1313 }
1314
1315 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1316 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offset(
1317 const_offset_iterator pos,
1318 offset_type value_size,
1319 size_type count
1320 ) -> offset_iterator
1321 {
1322 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1323 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1324 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1325 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1326
1327 // Check for offset overflow before adjusting
1328 if (!offset_buffer_adaptor.empty())
1329 {
1330 const offset_type last_offset = offset_buffer_adaptor.back();
1331 check_offset_overflow(last_offset, cumulative_size);
1332 }
1333
1334 // Adjust offsets for subsequent elements
1335 std::for_each(
1336 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1337 offset_buffer_adaptor.end(),
1338 [cumulative_size](auto& offset)
1339 {
1340 offset += cumulative_size;
1341 }
1342 );
1343 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1344 // Put the right values in the new offsets
1345 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1346 {
1347 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1348 }
1349 return offsets_begin() + idx;
1350 }
1351
1352 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1353 template <mpl::iterator_of_type<T> InputIt>
1354 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_values(
1356 InputIt first,
1357 InputIt last
1358 ) -> value_iterator
1359 {
1360 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1361 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1362 auto values = std::ranges::subrange(first, last);
1363 const size_t cumulative_sizes = std::accumulate(
1364 values.begin(),
1365 values.end(),
1366 size_t(0),
1367 [](size_t acc, const T& value)
1368 {
1369 return acc + value.size();
1370 }
1371 );
1372 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1373 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1374 const OT offset_begin = *offset(idx);
1375 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
1376
1377 // Move elements to make space for the new value
1378 std::move_backward(
1379 insert_pos,
1380 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
1381 data_buffer_adaptor.end()
1382 );
1383
1384 for (const T& value : values)
1385 {
1386 std::copy(value.begin(), value.end(), insert_pos);
1387 std::advance(insert_pos, value.size());
1388 }
1389
1390 const auto sizes_of_each_value = std::ranges::views::transform(
1391 values,
1392 [](const T& value) -> offset_type
1393 {
1394 return static_cast<offset_type>(value.size());
1395 }
1396 );
1397 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1398 return sparrow::next(value_begin(), idx);
1399 }
1400
1401 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1402 template <mpl::iterator_of_type<OT> InputIt>
1403 constexpr auto variable_size_binary_array_impl<T, CR, OT, Ext>::insert_offsets(
1405 InputIt first_sizes,
1406 InputIt last_sizes
1407 ) -> offset_iterator
1408 {
1409 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1410 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1411 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1412 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1413 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1414 const auto idx = std::distance(offsets_cbegin(), pos);
1415 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1416
1417 // Check for offset overflow before adjusting
1418 if (!offset_buffer_adaptor.empty())
1419 {
1420 const offset_type last_offset = offset_buffer_adaptor.back();
1421 check_offset_overflow(last_offset, cumulative_sizes);
1422 }
1423
1424 const auto sizes_count = std::distance(first_sizes, last_sizes);
1425 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1426 // Move the offsets to make space for the new offsets
1427 std::move_backward(
1428 offset_buffer_adaptor.begin() + idx,
1429 offset_buffer_adaptor.end() - sizes_count,
1430 offset_buffer_adaptor.end()
1431 );
1432 // Adjust offsets for subsequent elements
1433 std::for_each(
1434 offset_buffer_adaptor.begin() + idx + sizes_count,
1435 offset_buffer_adaptor.end(),
1436 [cumulative_sizes](auto& offset)
1437 {
1438 offset += cumulative_sizes;
1439 }
1440 );
1441 // Put the right values in the new offsets
1442 InputIt it = first_sizes;
1443 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1444 {
1445 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1446 ++it;
1447 }
1448 return offset(static_cast<size_t>(idx));
1449 }
1450
1451 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1452 constexpr auto
1453 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_values(const_value_iterator pos, size_type count)
1454 -> value_iterator
1455 {
1456 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1457 SPARROW_ASSERT_TRUE(pos <= value_cend());
1458 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1459 if (count == 0)
1460 {
1461 return sparrow::next(value_begin(), index);
1462 }
1463 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1464 const auto offset_begin = *offset(index);
1465 const auto offset_end = *offset(index + count);
1466 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1467 // move the values after the erased ones
1468 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
1469 data_buffer.resize(data_buffer.size() - difference);
1470 // adjust the offsets for the subsequent elements
1471 erase_offsets(offset(index), count);
1472 return sparrow::next(value_begin(), index);
1473 }
1474
1475 template <std::ranges::sized_range T, class CR, layout_offset OT, typename Ext>
1476 constexpr auto
1477 variable_size_binary_array_impl<T, CR, OT, Ext>::erase_offsets(const_offset_iterator pos, size_type count)
1478 -> offset_iterator
1479 {
1480 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1481 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1482 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1483 if (count == 0)
1484 {
1485 return offset(index);
1486 }
1487 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1488 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1489 const OT offset_start_value = *offset(index);
1490 const OT offset_end_value = *offset(index + count);
1491 const OT difference = offset_end_value - offset_start_value;
1492 // move the offsets after the erased ones
1493 std::move(
1494 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1495 offset_buffer_adaptor.end(),
1496 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1497 );
1498 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1499 // adjust the offsets for the subsequent elements
1500 std::for_each(
1501 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1502 offset_buffer_adaptor.end(),
1503 [difference](OT& offset)
1504 {
1505 offset -= difference;
1506 }
1507 );
1508 return offset(index);
1509 }
1510
1511}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:113
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
bitset_iterator< self_type, true > const_iterator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT, Ext > binary_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT, Ext > string_array_impl
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of Arrow data types, usually associated with the raw bytes of the corresponding value.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.