sparrow 0.9.0
Loading...
Searching...
No Matches
variable_size_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <numeric>
20#include <optional>
21#include <ranges>
22#include <string>
23#include <vector>
24
36
37namespace sparrow
38{
39 namespace detail
40 {
41 template <class T, class OT>
43
// Specialization for std::string values addressed with 32-bit offsets.
// format() returns "u", the Arrow C data interface format string for a
// UTF-8 string.
44 template <>
45 struct variable_size_binary_format<std::string, std::int32_t>
46 {
47 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
48 {
49 return "u";
50 }
51 };
52
// Specialization for std::string values addressed with 64-bit offsets.
// format() returns "U", the Arrow C data interface format string for a
// large UTF-8 string.
53 template <>
54 struct variable_size_binary_format<std::string, std::int64_t>
55 {
56 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
57 {
58 return "U";
59 }
60 };
61
// Specialization for std::vector<byte_t> values addressed with 32-bit offsets.
// format() returns "z", the Arrow C data interface format string for binary
// data.
62 template <>
63 struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
64 {
65 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
66 {
67 return "z";
68 }
69 };
70
// Specialization for std::vector<byte_t> values addressed with 64-bit offsets.
// format() returns "Z", the Arrow C data interface format string for large
// binary data.
71 template <>
72 struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
73 {
74 [[nodiscard]] static SPARROW_CONSTEXPR_GCC_11 std::string format() noexcept
75 {
76 return "Z";
77 }
78 };
79 }
80
// Forward declaration of the array implementation defined later in this file.
// T is the value type (a sized range), CR the const reference type handed out
// for elements, and OT the offset type.
81 template <std::ranges::sized_range T, class CR, layout_offset OT>
82 class variable_size_binary_array_impl;
83
84 template <layout_offset OT>
88 OT>;
89
90 template <layout_offset OT>
93 arrow_traits<std::vector<byte_t>>::const_reference,
94 OT>;
95
109
123
137
151
152 namespace detail
153 {
154 template <>
156 {
157 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
158 {
160 }
161 };
162
163 template <>
165 {
166 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
167 {
169 }
170 };
171
172 template <>
174 {
175 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
176 {
178 }
179 };
180
181 template <>
183 {
184 [[nodiscard]] static constexpr sparrow::data_type get() noexcept
185 {
187 }
188 };
189 }
190
194 template <class T>
195 constexpr bool is_string_array_v = std::same_as<T, string_array>;
196
200 template <class T>
201 constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;
202
206 template <class T>
207 constexpr bool is_binary_array_v = std::same_as<T, binary_array>;
208
212 template <class T>
213 constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;
214
215 template <std::ranges::sized_range T, class CR, layout_offset OT>
217 {
219
223 using offset_type = OT;
224
225 using data_value_type = typename T::value_type;
226
227 using offset_iterator = OT*;
228 using const_offset_iterator = const OT*;
229
232
233 using iterator_tag = std::random_access_iterator_tag;
234
236
245
247
256
258
259 // using iterator = layout_iterator<array_type, false>;
260 // using const_iterator = layout_iterator<array_type, true, CR>;
261 };
262
271 template <std::ranges::sized_range T, class CR, layout_offset OT>
273 : public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT>>
274 {
275 private:
276
277 static_assert(
278 sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
279 "Only sequences of types with the same size as uint8_t are supported"
280 );
281
282 public:
283
286
288 using inner_value_type = typename inner_types::inner_value_type;
289 using inner_reference = typename inner_types::inner_reference;
290 using inner_const_reference = typename inner_types::inner_const_reference;
291
292 using offset_type = typename inner_types::offset_type;
297
299 using bitmap_reference = typename base_type::bitmap_reference;
302
306
307 using offset_iterator = typename inner_types::offset_iterator;
308 using const_offset_iterator = typename inner_types::const_offset_iterator;
309
313 using data_iterator = typename inner_types::data_iterator;
314
315 using const_data_iterator = typename inner_types::const_data_iterator;
316 using data_value_type = typename inner_types::data_value_type;
317
318 using value_iterator = typename inner_types::value_iterator;
319 using const_value_iterator = typename inner_types::const_value_iterator;
320
338
352 template <class... ARGS>
355 : self_type(create_proxy(std::forward<ARGS>(args)...))
356 {
357 }
358
359 using base_type::get_arrow_proxy;
360 using base_type::size;
361
374 [[nodiscard]] constexpr inner_reference value(size_type i);
375
391 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
392
409 template <std::ranges::range SIZES_RANGE>
410 [[nodiscard]] static constexpr auto offset_from_sizes(SIZES_RANGE&& sizes) -> offset_buffer_type;
411
412 private:
413
434 template <
437 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
438 [[nodiscard]] static arrow_proxy create_proxy(
439 u8_buffer<C>&& data_buffer,
440 offset_buffer_type&& list_offsets,
441 VB&& validity_input = validity_bitmap{},
442 std::optional<std::string_view> name = std::nullopt,
443 std::optional<METADATA_RANGE> metadata = std::nullopt
444 );
445
464 template <
465 std::ranges::input_range R,
467 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
468 requires(
469 std::ranges::input_range<std::ranges::range_value_t<R>>
471 )
472 [[nodiscard]] static arrow_proxy create_proxy(
473 R&& values,
474 VB&& validity_input = validity_bitmap{},
475 std::optional<std::string_view> name = std::nullopt,
476 std::optional<METADATA_RANGE> metadata = std::nullopt
477 );
478
496 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
497 requires(
498 std::ranges::input_range<std::ranges::range_value_t<R>>
499 && mpl::char_like<std::ranges::range_value_t<std::ranges::range_value_t<R>>>
500 )
501 [[nodiscard]] static arrow_proxy create_proxy(
502 R&& values,
503 bool nullable,
504 std::optional<std::string_view> name = std::nullopt,
505 std::optional<METADATA_RANGE> metadata = std::nullopt
506 );
507
522 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
523 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
524 [[nodiscard]] static arrow_proxy create_proxy(
525 R&&,
526 std::optional<std::string_view> name = std::nullopt,
527 std::optional<METADATA_RANGE> metadata = std::nullopt
528 );
529
547 template <mpl::char_like C, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
548 [[nodiscard]] static arrow_proxy create_proxy_impl(
549 u8_buffer<C>&& data_buffer,
550 offset_buffer_type&& list_offsets,
551 std::optional<validity_bitmap>&&,
552 std::optional<std::string_view> name = std::nullopt,
553 std::optional<METADATA_RANGE> metadata = std::nullopt
554 );
555
556 static constexpr size_t OFFSET_BUFFER_INDEX = 1;
557 static constexpr size_t DATA_BUFFER_INDEX = 2;
558
570 [[nodiscard]] constexpr offset_iterator offset(size_type i);
571
579 [[nodiscard]] constexpr offset_iterator offsets_begin();
580
588 [[nodiscard]] constexpr offset_iterator offsets_end();
589
601 [[nodiscard]] constexpr data_iterator data(size_type i);
602
610 [[nodiscard]] constexpr value_iterator value_begin();
611
619 [[nodiscard]] constexpr value_iterator value_end();
620
628 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
629
637 [[nodiscard]] constexpr const_value_iterator value_cend() const;
638
650 [[nodiscard]] constexpr const_offset_iterator offset(size_type i) const;
651
659 [[nodiscard]] constexpr const_offset_iterator offsets_cbegin() const;
660
668 [[nodiscard]] constexpr const_offset_iterator offsets_cend() const;
669
681 [[nodiscard]] constexpr const_data_iterator data(size_type i) const;
682
683 // Modifiers
684
697 template <std::ranges::sized_range U>
698 requires mpl::convertible_ranges<U, T>
699 constexpr void resize_values(size_type new_length, U value);
700
710 constexpr void resize_offsets(size_type new_length, offset_type offset_value);
711
727 template <std::ranges::sized_range U>
728 requires mpl::convertible_ranges<U, T>
729 constexpr value_iterator insert_value(const_value_iterator pos, U value, size_type count);
730
743 constexpr offset_iterator insert_offset(const_offset_iterator pos, offset_type size, size_type count);
744
761 template <mpl::iterator_of_type<T> InputIt>
762 constexpr value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
763
784 template <mpl::iterator_of_type<OT> InputIt>
785 constexpr offset_iterator
786 insert_offsets(const_offset_iterator pos, InputIt first_sizes, InputIt last_sizes);
787
805 constexpr value_iterator erase_values(const_value_iterator pos, size_type count);
806
823 constexpr offset_iterator erase_offsets(const_offset_iterator pos, size_type count);
824
840 template <std::ranges::sized_range U>
841 requires mpl::convertible_ranges<U, T>
842 constexpr void assign(U&& rhs, size_type index);
843
846 friend base_type;
849 };
850
851 /*********************************************
852 * variable_size_binary_array_impl implementation *
853 *********************************************/
854
855 template <std::ranges::sized_range T, class CR, layout_offset OT>
857 : base_type(std::move(proxy))
858 {
859 const auto type = this->get_arrow_proxy().data_type();
862 || type == data_type::LARGE_BINARY
863 );
865 (((type == data_type::STRING || type == data_type::BINARY) && std::same_as<OT, int32_t>)
866 || ((type == data_type::LARGE_STRING || type == data_type::LARGE_BINARY)
867 && std::same_as<OT, int64_t>) )
868 );
869 }
870
871 template <std::ranges::sized_range T, class CR, layout_offset OT>
872 template <std::ranges::range SIZES_RANGE>
875 {
877 std::forward<SIZES_RANGE>(sizes)
878 );
879 }
880
881 template <std::ranges::sized_range T, class CR, layout_offset OT>
882 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
883 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
884 u8_buffer<C>&& data_buffer,
885 offset_buffer_type&& offsets,
886 VB&& validity_input,
887 std::optional<std::string_view> name,
888 std::optional<METADATA_RANGE> metadata
889 )
890 {
891 const auto size = offsets.size() - 1;
892 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
893 const auto null_count = vbitmap.null_count();
894
897 std::move(name), // name
898 std::move(metadata), // metadata
899 std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE}), // flags,
900 nullptr, // children
901 repeat_view<bool>(true, 0),
902 nullptr, // dictionary
903 true
904
905 );
906 std::vector<buffer<std::uint8_t>> arr_buffs = {
907 std::move(vbitmap).extract_storage(),
908 std::move(offsets).extract_storage(),
909 std::move(data_buffer).extract_storage()
910 };
911
912 ArrowArray arr = make_arrow_array(
913 static_cast<std::int64_t>(size), // length
914 static_cast<int64_t>(null_count),
915 0, // offset
916 std::move(arr_buffs),
917 nullptr, // children
918 repeat_view<bool>(true, 0),
919 nullptr, // dictionary
920 true
921 );
922 return arrow_proxy{std::move(arr), std::move(schema)};
923 }
924
925 template <std::ranges::sized_range T, class CR, layout_offset OT>
926 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
927 requires(
928 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
930 // range of char-like
931 )
932 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
933 R&& values,
934 VB&& validity_input,
935 std::optional<std::string_view> name,
936 std::optional<METADATA_RANGE> metadata
937 )
938 {
939 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
940
941 auto size_range = values
942 | std::views::transform(
943 [](const auto& v)
944 {
945 return std::ranges::size(v);
946 }
947 );
948 auto offset_buffer = offset_from_sizes(size_range);
949 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
950 return create_proxy(
951 std::move(data_buffer),
952 std::move(offset_buffer),
953 std::forward<VB>(validity_input),
954 std::forward<std::optional<std::string_view>>(name),
955 std::forward<std::optional<METADATA_RANGE>>(metadata)
956 );
957 }
958
959 template <std::ranges::sized_range T, class CR, layout_offset OT>
960 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
961 requires std::is_same_v<std::ranges::range_value_t<R>, nullable<T>>
962 arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
963 R&& range,
964 std::optional<std::string_view> name,
965 std::optional<METADATA_RANGE> metadata
966 )
967 {
968 // split into values and is_non_null ranges
969 const auto values = range
970 | std::views::transform(
971 [](const auto& v)
972 {
973 return v.get();
974 }
975 );
976 const auto is_non_null = range
977 | std::views::transform(
978 [](const auto& v)
979 {
980 return v.has_value();
981 }
982 );
983 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
984 }
985
986 template <std::ranges::sized_range T, class CR, layout_offset OT>
987 template <
988 std::ranges::input_range R,
989 input_metadata_container METADATA_RANGE>
990 requires(
991 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
993 // range of
994 // char-like
995 )
996 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy(
997 R&& values,
998 bool nullable,
999 std::optional<std::string_view> name,
1000 std::optional<METADATA_RANGE> metadata
1001 )
1002 {
1003 using values_inner_value_type = std::ranges::range_value_t<std::ranges::range_value_t<R>>;
1004 const size_t size = std::ranges::size(values);
1005 u8_buffer<values_inner_value_type> data_buffer(std::ranges::views::join(values));
1006 auto size_range = values
1007 | std::views::transform(
1008 [](const auto& v)
1009 {
1010 return std::ranges::size(v);
1011 }
1012 );
1013 auto offset_buffer = offset_from_sizes(size_range);
1014 return create_proxy_impl(
1015 std::move(data_buffer),
1016 std::move(offset_buffer),
1017 nullable ? std::make_optional<validity_bitmap>(nullptr, size) : std::nullopt,
1018 std::move(name),
1019 std::move(metadata)
1020 );
1021 }
1022
1023 template <std::ranges::sized_range T, class CR, layout_offset OT>
1024 template <mpl::char_like C, input_metadata_container METADATA_RANGE>
1025 [[nodiscard]] arrow_proxy variable_size_binary_array_impl<T, CR, OT>::create_proxy_impl(
1026 u8_buffer<C>&& data_buffer,
1027 offset_buffer_type&& list_offsets,
1028 std::optional<validity_bitmap>&& bitmap,
1029 std::optional<std::string_view> name,
1030 std::optional<METADATA_RANGE> metadata
1031 )
1032 {
1033 const auto size = list_offsets.size() - 1;
1034 const auto null_count = bitmap.has_value() ? bitmap->null_count() : 0;
1035
1036 const std::optional<std::unordered_set<sparrow::ArrowFlag>>
1037 flags = bitmap.has_value()
1038 ? std::make_optional<std::unordered_set<sparrow::ArrowFlag>>({ArrowFlag::NULLABLE})
1039 : std::nullopt;
1040
1041 ArrowSchema schema = make_arrow_schema(
1043 std::move(name), // name
1044 std::move(metadata), // metadata
1045 flags, // flags,
1046 nullptr, // children
1047 repeat_view<bool>(true, 0),
1048 nullptr, // dictionary
1049 true
1050
1051 );
1052 std::vector<buffer<std::uint8_t>> arr_buffs = {
1053 bitmap.has_value() ? std::move(*bitmap).extract_storage() : buffer<std::uint8_t>{nullptr, 0},
1054 std::move(list_offsets).extract_storage(),
1055 std::move(data_buffer).extract_storage()
1056 };
1057
1058 ArrowArray arr = make_arrow_array(
1059 static_cast<std::int64_t>(size), // length
1060 static_cast<int64_t>(null_count),
1061 0, // offset
1062 std::move(arr_buffs),
1063 nullptr, // children
1064 repeat_view<bool>(true, 0),
1065 nullptr, // dictionary
1066 true
1067 );
1068 return arrow_proxy{std::move(arr), std::move(schema)};
1069 }
1070
1071 template <std::ranges::sized_range T, class CR, layout_offset OT>
1072 constexpr auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) -> data_iterator
1073 {
1074 arrow_proxy& proxy = get_arrow_proxy();
1075 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1076 return proxy.buffers()[DATA_BUFFER_INDEX].template data<data_value_type>() + i;
1077 }
1078
1079 template <std::ranges::sized_range T, class CR, layout_offset OT>
1080 constexpr auto variable_size_binary_array_impl<T, CR, OT>::data(size_type i) const -> const_data_iterator
1081 {
1082 const arrow_proxy& proxy = this->get_arrow_proxy();
1083 SPARROW_ASSERT_TRUE(proxy.buffers()[DATA_BUFFER_INDEX].size() >= i);
1084 return proxy.buffers()[DATA_BUFFER_INDEX].template data<const data_value_type>() + i;
1085 }
1086
1087 template <std::ranges::sized_range T, class CR, layout_offset OT>
1088 template <std::ranges::sized_range U>
1090 constexpr void variable_size_binary_array_impl<T, CR, OT>::assign(U&& rhs, size_type index)
1091 {
1092 SPARROW_ASSERT_TRUE(index < size());
1093 const auto offset_beg = *offset(index);
1094 const auto offset_end = *offset(index + 1);
1095 const auto initial_value_length = offset_end - offset_beg;
1096 const auto new_value_length = static_cast<OT>(std::ranges::size(rhs));
1097 const OT shift_byte_count = new_value_length - initial_value_length;
1098 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1099 if (shift_byte_count != 0)
1100 {
1101 const auto shift_val_abs = static_cast<size_t>(std::abs(shift_byte_count));
1102 const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
1103 : data_buffer.size() + shift_val_abs;
1104
1105 if (shift_byte_count > 0)
1106 {
1107 data_buffer.resize(new_data_buffer_size);
1108 // Move elements to make space for the new value
1109 std::move_backward(
1110 sparrow::next(data_buffer.begin(), offset_end),
1111 sparrow::next(data_buffer.end(), -shift_byte_count),
1112 data_buffer.end()
1113 );
1114 }
1115 else
1116 {
1117 std::move(
1118 sparrow::next(data_buffer.begin(), offset_end),
1119 data_buffer.end(),
1120 sparrow::next(data_buffer.begin(), offset_end + shift_byte_count)
1121 );
1122 data_buffer.resize(new_data_buffer_size);
1123 }
1124 // Adjust offsets for subsequent elements
1125 std::for_each(
1126 offset(index + 1),
1127 offset(size() + 1),
1128 [shift_byte_count](auto& offset)
1129 {
1130 offset += shift_byte_count;
1131 }
1132 );
1133 }
1134 auto tmp = std::views::transform(
1135 rhs,
1136 [](const auto& val)
1137 {
1138 return static_cast<std::uint8_t>(val);
1139 }
1140 );
1141 // Copy the new value into the buffer
1142 std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), sparrow::next(data_buffer.begin(), offset_beg));
1143 }
1144
1145 template <std::ranges::sized_range T, class CR, layout_offset OT>
1146 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) -> offset_iterator
1147 {
1148 SPARROW_ASSERT_TRUE(i <= size() + this->get_arrow_proxy().offset());
1149 return get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1150 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1151 }
1152
1153 template <std::ranges::sized_range T, class CR, layout_offset OT>
1154 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offset(size_type i) const
1155 -> const_offset_iterator
1156 {
1157 SPARROW_ASSERT_TRUE(i <= this->size() + this->get_arrow_proxy().offset());
1158 return this->get_arrow_proxy().buffers()[OFFSET_BUFFER_INDEX].template data<OT>()
1159 + static_cast<size_type>(this->get_arrow_proxy().offset()) + i;
1160 }
1161
// Returns a mutable iterator to the first offset of this array, i.e. offset(0).
1162 template <std::ranges::sized_range T, class CR, layout_offset OT>
1163 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offsets_begin() -> offset_iterator
1164 {
1165 return offset(0);
1166 }
1167
// Returns a read-only iterator to the first offset of this array, i.e. offset(0).
1168 template <std::ranges::sized_range T, class CR, layout_offset OT>
1169 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offsets_cbegin() const -> const_offset_iterator
1170 {
1171 return offset(0);
1172 }
1173
1174 template <std::ranges::sized_range T, class CR, layout_offset OT>
1175 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offsets_end() -> offset_iterator
1176 {
1177 return offset(size() + 1);
1178 }
1179
1180 template <std::ranges::sized_range T, class CR, layout_offset OT>
1181 constexpr auto variable_size_binary_array_impl<T, CR, OT>::offsets_cend() const -> const_offset_iterator
1182 {
1183 return offset(size() + 1);
1184 }
1185
1186 template <std::ranges::sized_range T, class CR, layout_offset OT>
1188 {
1189 SPARROW_ASSERT_TRUE(i < size());
1190 return inner_reference(this, i);
1191 }
1192
1193 template <std::ranges::sized_range T, class CR, layout_offset OT>
1195 {
1196 SPARROW_ASSERT_TRUE(i < this->size());
1197 const OT offset_begin = *offset(i);
1198 SPARROW_ASSERT_TRUE(offset_begin >= 0);
1199 const OT offset_end = *offset(i + 1);
1200 SPARROW_ASSERT_TRUE(offset_end >= 0);
1201 const const_data_iterator pointer_begin = data(static_cast<size_t>(offset_begin));
1202 const const_data_iterator pointer_end = data(static_cast<size_t>(offset_end));
1203 return inner_const_reference(pointer_begin, pointer_end);
1204 }
1205
// Returns a mutable value iterator bound to this array at index 0.
1206 template <std::ranges::sized_range T, class CR, layout_offset OT>
1207 constexpr auto variable_size_binary_array_impl<T, CR, OT>::value_begin() -> value_iterator
1208 {
1209 return value_iterator{this, 0};
1210 }
1211
1212 template <std::ranges::sized_range T, class CR, layout_offset OT>
1213 constexpr auto variable_size_binary_array_impl<T, CR, OT>::value_end() -> value_iterator
1214 {
1215 return sparrow::next(value_begin(), size());
1216 }
1217
// Returns a read-only value iterator bound to this array at index 0.
1218 template <std::ranges::sized_range T, class CR, layout_offset OT>
1219 constexpr auto variable_size_binary_array_impl<T, CR, OT>::value_cbegin() const -> const_value_iterator
1220 {
1221 return const_value_iterator{this, 0};
1222 }
1223
1224 template <std::ranges::sized_range T, class CR, layout_offset OT>
1225 constexpr auto variable_size_binary_array_impl<T, CR, OT>::value_cend() const -> const_value_iterator
1226 {
1227 return sparrow::next(value_cbegin(), this->size());
1228 }
1229
1230 template <std::ranges::sized_range T, class CR, layout_offset OT>
1231 template <std::ranges::sized_range U>
1233 constexpr void variable_size_binary_array_impl<T, CR, OT>::resize_values(size_type new_length, U value)
1234 {
1235 const size_t new_size = new_length + static_cast<size_t>(this->get_arrow_proxy().offset());
1236 auto& buffers = this->get_arrow_proxy().get_array_private_data()->buffers();
1237 if (new_length < size())
1238 {
1239 const auto offset_begin = static_cast<size_t>(*offset(new_length));
1240 auto& data_buffer = buffers[DATA_BUFFER_INDEX];
1241 data_buffer.resize(offset_begin);
1242 auto& offset_buffer = buffers[OFFSET_BUFFER_INDEX];
1243 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1244 offset_buffer_adaptor.resize(new_size + 1);
1245 }
1246 else if (new_length > size())
1247 {
1248 insert_value(value_cend(), value, new_length - size());
1249 }
1250 }
1251
1252 template <std::ranges::sized_range T, class CR, layout_offset OT>
1253 template <std::ranges::sized_range U>
1255 constexpr auto
1256 variable_size_binary_array_impl<T, CR, OT>::insert_value(const_value_iterator pos, U value, size_type count)
1258 {
1259 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1260 const OT offset_begin = *offset(idx);
1261 const std::vector<uint8_t> casted_value{value.cbegin(), value.cend()};
1262 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
1263 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
1264 auto& data_buffer = this->get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1265 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
1266 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
1267 insert_offset(offsets_cbegin() + idx + 1, static_cast<offset_type>(value.size()), count);
1268 return sparrow::next(value_begin(), idx);
1269 }
1270
1271 template <std::ranges::sized_range T, class CR, layout_offset OT>
1272 constexpr auto variable_size_binary_array_impl<T, CR, OT>::insert_offset(
1273 const_offset_iterator pos,
1274 offset_type value_size,
1275 size_type count
1276 ) -> offset_iterator
1277 {
1278 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1279 const auto idx = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1280 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1281 const offset_type cumulative_size = value_size * static_cast<offset_type>(count);
1282 // Adjust offsets for subsequent elements
1283 std::for_each(
1284 sparrow::next(offset_buffer_adaptor.begin(), idx + 1),
1285 offset_buffer_adaptor.end(),
1286 [cumulative_size](auto& offset)
1287 {
1288 offset += cumulative_size;
1289 }
1290 );
1291 offset_buffer_adaptor.insert(sparrow::next(offset_buffer_adaptor.cbegin(), idx + 1), count, 0);
1292 // Put the right values in the new offsets
1293 for (size_t i = idx + 1; i < idx + 1 + count; ++i)
1294 {
1295 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + value_size;
1296 }
1297 return offsets_begin() + idx;
1298 }
1299
1300 template <std::ranges::sized_range T, class CR, layout_offset OT>
1301 template <mpl::iterator_of_type<T> InputIt>
1302 constexpr auto
1303 variable_size_binary_array_impl<T, CR, OT>::insert_values(const_value_iterator pos, InputIt first, InputIt last)
1305 {
1306 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1307 auto data_buffer_adaptor = make_buffer_adaptor<data_value_type>(data_buffer);
1308 auto values = std::ranges::subrange(first, last);
1309 const size_t cumulative_sizes = std::accumulate(
1310 values.begin(),
1311 values.end(),
1312 size_t(0),
1313 [](size_t acc, const T& value)
1314 {
1315 return acc + value.size();
1316 }
1317 );
1318 data_buffer_adaptor.resize(data_buffer_adaptor.size() + cumulative_sizes);
1319 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
1320 const OT offset_begin = *offset(idx);
1321 auto insert_pos = sparrow::next(data_buffer_adaptor.begin(), offset_begin);
1322
1323 // Move elements to make space for the new value
1324 std::move_backward(
1325 insert_pos,
1326 sparrow::next(data_buffer_adaptor.end(), -static_cast<difference_type>(cumulative_sizes)),
1327 data_buffer_adaptor.end()
1328 );
1329
1330 for (const T& value : values)
1331 {
1332 std::copy(value.begin(), value.end(), insert_pos);
1333 std::advance(insert_pos, value.size());
1334 }
1335
1336 const auto sizes_of_each_value = std::ranges::views::transform(
1337 values,
1338 [](const T& value) -> offset_type
1339 {
1340 return static_cast<offset_type>(value.size());
1341 }
1342 );
1343 insert_offsets(offset(idx + 1), sizes_of_each_value.begin(), sizes_of_each_value.end());
1344 return sparrow::next(value_begin(), idx);
1345 }
1346
1347 template <std::ranges::sized_range T, class CR, layout_offset OT>
1348 template <mpl::iterator_of_type<OT> InputIt>
1349 constexpr auto variable_size_binary_array_impl<T, CR, OT>::insert_offsets(
1351 InputIt first_sizes,
1352 InputIt last_sizes
1353 ) -> offset_iterator
1354 {
1355 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1356 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1357 SPARROW_ASSERT_TRUE(first_sizes <= last_sizes);
1358 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1359 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1360 const auto idx = std::distance(offsets_cbegin(), pos);
1361 const OT cumulative_sizes = std::reduce(first_sizes, last_sizes, OT(0));
1362 const auto sizes_count = std::distance(first_sizes, last_sizes);
1363 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() + static_cast<size_t>(sizes_count));
1364 // Move the offsets to make space for the new offsets
1365 std::move_backward(
1366 offset_buffer_adaptor.begin() + idx,
1367 offset_buffer_adaptor.end() - sizes_count,
1368 offset_buffer_adaptor.end()
1369 );
1370 // Adjust offsets for subsequent elements
1371 std::for_each(
1372 offset_buffer_adaptor.begin() + idx + sizes_count,
1373 offset_buffer_adaptor.end(),
1374 [cumulative_sizes](auto& offset)
1375 {
1376 offset += cumulative_sizes;
1377 }
1378 );
1379 // Put the right values in the new offsets
1380 InputIt it = first_sizes;
1381 for (size_t i = static_cast<size_t>(idx + 1); i < static_cast<size_t>(idx + sizes_count + 1); ++i)
1382 {
1383 offset_buffer_adaptor[i] = offset_buffer_adaptor[i - 1] + *it;
1384 ++it;
1385 }
1386 return offset(static_cast<size_t>(idx));
1387 }
1388
1389 template <std::ranges::sized_range T, class CR, layout_offset OT>
1390 constexpr auto
1391 variable_size_binary_array_impl<T, CR, OT>::erase_values(const_value_iterator pos, size_type count)
1392 -> value_iterator
1393 {
1394 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
1395 SPARROW_ASSERT_TRUE(pos <= value_cend());
1396 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
1397 if (count == 0)
1398 {
1399 return sparrow::next(value_begin(), index);
1400 }
1401 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
1402 const auto offset_begin = *offset(index);
1403 const auto offset_end = *offset(index + count);
1404 const size_t difference = static_cast<size_t>(offset_end - offset_begin);
1405 // move the values after the erased ones
1406 std::move(data_buffer.begin() + offset_end, data_buffer.end(), data_buffer.begin() + offset_begin);
1407 data_buffer.resize(data_buffer.size() - difference);
1408 // adjust the offsets for the subsequent elements
1409 erase_offsets(offset(index), count);
1410 return sparrow::next(value_begin(), index);
1411 }
1412
1413 template <std::ranges::sized_range T, class CR, layout_offset OT>
1414 constexpr auto
1415 variable_size_binary_array_impl<T, CR, OT>::erase_offsets(const_offset_iterator pos, size_type count)
1416 -> offset_iterator
1417 {
1418 SPARROW_ASSERT_TRUE(pos >= offsets_cbegin());
1419 SPARROW_ASSERT_TRUE(pos <= offsets_cend());
1420 const size_t index = static_cast<size_t>(std::distance(offsets_cbegin(), pos));
1421 if (count == 0)
1422 {
1423 return offset(index);
1424 }
1425 auto& offset_buffer = get_arrow_proxy().get_array_private_data()->buffers()[OFFSET_BUFFER_INDEX];
1426 auto offset_buffer_adaptor = make_buffer_adaptor<OT>(offset_buffer);
1427 const OT offset_start_value = *offset(index);
1428 const OT offset_end_value = *offset(index + count);
1429 const OT difference = offset_end_value - offset_start_value;
1430 // move the offsets after the erased ones
1431 std::move(
1432 sparrow::next(offset_buffer_adaptor.begin(), index + count + 1),
1433 offset_buffer_adaptor.end(),
1434 sparrow::next(offset_buffer_adaptor.begin(), index + 1)
1435 );
1436 offset_buffer_adaptor.resize(offset_buffer_adaptor.size() - count);
1437 // adjust the offsets for the subsequent elements
1438 std::for_each(
1439 sparrow::next(offset_buffer_adaptor.begin(), index + 1),
1440 offset_buffer_adaptor.end(),
1441 [difference](OT& offset)
1442 {
1443 offset -= difference;
1444 }
1445 );
1446 return offset(index);
1447 }
1448
1449}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
bitset_iterator< self_type, true > const_iterator
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
constexpr inner_const_reference value(size_type i) const
Gets const reference to element at specified index.
variable_size_binary_array_impl(ARGS &&... args)
Generic constructor for creating array from various inputs.
variable_size_binary_array_impl(arrow_proxy)
Constructs array from Arrow proxy.
constexpr inner_reference value(size_type i)
Gets mutable reference to element at specified index.
static constexpr auto offset_from_sizes(SIZES_RANGE &&sizes) -> offset_buffer_type
Creates offset buffer from a range of sizes.
Iterator over the data values of a variable size binary layout.
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:304
Concept for iterator types.
Concept for character-like types.
Concept for convertible range types.
Definition mp_utils.hpp:931
Concept defining valid input types for validity bitmap creation.
#define SPARROW_CONSTEXPR_GCC_11
Definition config.hpp:50
#define SPARROW_ASSERT_TRUE(expr__)
constexpr sparrow::u8_buffer< OFFSET_TYPE > offset_buffer_from_sizes(SIZES_RANGE &&sizes)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
binary_array_impl< std::int64_t > big_binary_array
Type alias for variable-size binary arrays with 64-bit offsets.
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
variable_size_binary_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference, OT > binary_array_impl
constexpr bool is_big_string_array_v
Checks whether T is a big_string_array type.
constexpr bool is_string_array_v
Checks whether T is a string_array type.
variable_size_binary_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference, OT > string_array_impl
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
string_array_impl< std::int64_t > big_string_array
Type alias for variable-size string arrays with 64-bit offsets.
string_array_impl< std::int32_t > string_array
Type alias for variable-size string arrays with 32-bit offsets.
constexpr bool is_big_binary_array_v
Checks whether T is a big_binary_array type.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
constexpr bool is_binary_array_v
Checks whether T is a binary_array type.
binary_array_impl< std::int32_t > binary_array
Type alias for variable-size binary arrays with 32-bit offsets.
auto make_buffer_adaptor(FromBufferRef &buf)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
data_type
Runtime identifier of Arrow data types, usually associated with the raw bytes of the corresponding value.
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
array_inner_types< variable_size_binary_array_impl< T, CR, OT > >::iterator_tag iterator_tag
variable_size_binary_value_iterator< array_type, iterator_types > value_iterator
variable_size_binary_value_iterator< array_type, const_iterator_types > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.