sparrow 0.3.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
32
33namespace sparrow
34{
35
36
37 template <class T>
41
42 namespace detail
43 {
44 template <class T>
45 struct get_data_type_from_array;
46
47 template <>
49 {
50 [[nodiscard]] static constexpr sparrow::data_type get()
51 {
53 }
54 };
55
56 template <>
58 {
59 [[nodiscard]] static constexpr sparrow::data_type get()
60 {
62 }
63 };
64 }
65
66 template <class T>
79
80 template <class T>
82 {
83 };
84
85 template <class T>
89
93 template <class T>
95
96 template <class T>
98 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T>>
99 {
100 public:
101
104
106 using inner_value_type = typename inner_types::inner_value_type;
107 using inner_reference = typename inner_types::inner_reference;
108 using inner_const_reference = typename inner_types::inner_const_reference;
109
111 using bitmap_reference = typename base_type::bitmap_reference;
115 using bitmap_range = typename base_type::bitmap_range;
117
121
125
126 using value_iterator = typename base_type::value_iterator;
127 using const_value_iterator = typename base_type::const_value_iterator;
128
129 using iterator = typename base_type::iterator;
130 using const_iterator = typename base_type::const_iterator;
131
133
134 template <class... Args>
137 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
138 {
139 }
140
141 private:
142
143 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap>
144 requires std::convertible_to<std::ranges::range_value_t<R>, T>
145 [[nodiscard]] static arrow_proxy create_proxy(
146 R&& range,
147 VB&& bitmap_input = validity_bitmap{},
148 std::optional<std::string_view> name = std::nullopt,
149 std::optional<std::string_view> metadata = std::nullopt
150 );
151
152 [[nodiscard]] inner_reference value(size_type i);
153 [[nodiscard]] inner_const_reference value(size_type i) const;
154
155 [[nodiscard]] value_iterator value_begin();
156 [[nodiscard]] value_iterator value_end();
157
158 [[nodiscard]] const_value_iterator value_cbegin() const;
159 [[nodiscard]] const_value_iterator value_cend() const;
160
161 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
162 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
163 static constexpr std::size_t SHORT_STRING_SIZE = 12;
164 static constexpr std::size_t PREFIX_SIZE = 4;
165 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
166 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
167 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
168 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
169 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
170
171 friend base_type;
173 };
174
175 template <class T>
180
181 template <class T>
182 template <std::ranges::input_range R, validity_bitmap_input VB>
183 requires std::convertible_to<std::ranges::range_value_t<R>, T>
184 arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
185 R&& range,
186 VB&& validity_input,
187 std::optional<std::string_view> name,
188 std::optional<std::string_view> metadata
189 )
190 {
191#ifdef __GNUC__
192# pragma GCC diagnostic push
193# pragma GCC diagnostic ignored "-Wcast-align"
194#endif
195
196 const auto size = range_size(range);
197 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
198 const auto null_count = vbitmap.null_count();
199
200 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
201
202 std::size_t long_string_storage_size = 0;
203 std::size_t i = 0;
204 for (auto&& val : range)
205 {
206 auto val_casted = val
207 | std::ranges::views::transform(
208 [](const auto& v)
209 {
210 return static_cast<std::uint8_t>(v);
211 }
212 );
213
214 const auto length = val.size();
215 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
216
217 // write length
218 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
219
220 if (length <= SHORT_STRING_SIZE)
221 {
222 // write data itself
223 std::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
224 }
225 else
226 {
227 // write the prefix of the data
228 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
229 std::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
230
231 // write the buffer index
232 *reinterpret_cast<std::int32_t*>(
233 length_ptr + BUFFER_INDEX_OFFSET
234 ) = static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX);
235
236 // write the buffer offset
237 *reinterpret_cast<std::int32_t*>(
238 length_ptr + BUFFER_OFFSET_OFFSET
239 ) = static_cast<std::int32_t>(long_string_storage_size);
240
241 // count the size of the long string storage
242 long_string_storage_size += length;
243 }
244 ++i;
245 }
246
247 // write the long string storage
248 buffer<uint8_t> long_string_storage(long_string_storage_size);
249 std::size_t long_string_storage_offset = 0;
250 for (auto&& val : range)
251 {
252 const auto length = val.size();
253 if (length > SHORT_STRING_SIZE)
254 {
255 auto val_casted = val
256 | std::ranges::views::transform(
257 [](const auto& v)
258 {
259 return static_cast<std::uint8_t>(v);
260 }
261 );
262
263 std::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
264 long_string_storage_offset += length;
265 }
266 }
267
268 // For binary or utf-8 view arrays, an extra buffer is appended which stores
269 // the lengths of each variadic data buffer as int64_t.
270 // This buffer is necessary since these buffer lengths are not trivially
271 // extractable from other data in an array of binary or utf-8 view type.
272 u8_buffer<int64_t> buffer_sizes(
273 static_cast<std::size_t>(1),
274 static_cast<int64_t>(long_string_storage_size)
275 );
276
277 // create arrow schema and array
278 ArrowSchema schema = make_arrow_schema(
279 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
280 std::move(name), // name
281 std::move(metadata), // metadata
282 std::nullopt, // flags
283 0, // n_children
284 nullptr, // children
285 nullptr // dictionary
286 );
287
288 std::vector<buffer<uint8_t>> buffers{
289 std::move(vbitmap).extract_storage(),
290 std::move(length_buffer),
291 std::move(long_string_storage),
292 std::move(buffer_sizes).extract_storage()
293 };
294
295 // create arrow array
296 ArrowArray arr = make_arrow_array(
297 static_cast<std::int64_t>(size), // length
298 static_cast<int64_t>(null_count),
299 0, // offset
300 std::move(buffers),
301 0, // n_children
302 nullptr, // children
303 nullptr // dictionary
304 );
305
306 return arrow_proxy{std::move(arr), std::move(schema)};
307
308#ifdef __GNUC__
309# pragma GCC diagnostic pop
310#endif
311 }
312
313 template <class T>
314 auto variable_size_binary_view_array_impl<T>::value(size_type i) -> inner_reference
315 {
316 return static_cast<const self_type*>(this)->value(i);
317 }
318
319 template <class T>
320 auto variable_size_binary_view_array_impl<T>::value(size_type i) const -> inner_const_reference
321 {
322#ifdef __GNUC__
323# pragma GCC diagnostic push
324# pragma GCC diagnostic ignored "-Wcast-align"
325#endif
326
327 SPARROW_ASSERT_TRUE(i < this->size());
328
329 constexpr std::size_t element_size = 16;
330 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
331 + (i * element_size);
332
333 auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
334 using char_or_byte = typename inner_const_reference::value_type;
335
336 if (length <= 12)
337 {
338 constexpr std::ptrdiff_t data_offset = 4;
339 auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
340 const auto ret = inner_const_reference(ptr + data_offset, length);
341 return ret;
342 }
343 else
344 {
345 constexpr std::ptrdiff_t buffer_index_offset = 8;
346 constexpr std::ptrdiff_t buffer_offset_offset = 12;
347 auto buffer_index = static_cast<std::size_t>(
348 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_index_offset)
349 );
350 auto buffer_offset = static_cast<std::size_t>(
351 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_offset_offset)
352 );
353 auto buffer = this->get_arrow_proxy().buffers()[buffer_index].template data<const char_or_byte>();
354 return inner_const_reference(buffer + buffer_offset, length);
355 }
356
357#ifdef __GNUC__
358# pragma GCC diagnostic pop
359#endif
360 }
361
362 template <class T>
363 auto variable_size_binary_view_array_impl<T>::value_begin() -> value_iterator
364 {
366 }
367
368 template <class T>
369 auto variable_size_binary_view_array_impl<T>::value_end() -> value_iterator
370 {
371 return value_iterator(detail::layout_value_functor<self_type, inner_value_type>(this), this->size());
372 }
373
374 template <class T>
375 auto variable_size_binary_view_array_impl<T>::value_cbegin() const -> const_value_iterator
376 {
377 return const_value_iterator(detail::layout_value_functor<const self_type, inner_value_type>(this), 0);
378 }
379
380 template <class T>
381 auto variable_size_binary_view_array_impl<T>::value_cend() const -> const_value_iterator
382 {
383 return const_value_iterator(
385 this->size()
386 );
387 }
388}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:109
constexpr size_type null_count() const noexcept
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
nullable< inner_const_reference, bitmap_const_reference > const_reference
variable_size_binary_view_array_impl< std::string_view > self_type
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
ArrowSchema make_arrow_schema(F format, N name, M metadata, std::optional< ArrowFlag > flags, int64_t n_children, ArrowSchema **children, ArrowSchema *dictionary)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
variable_size_binary_view_array_impl< std::span< const std::byte > > binary_view_array
variable_size_binary_view_array_impl< std::string_view > string_view_array
dynamic_bitset< std::uint8_t > validity_bitmap
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
std::size_t range_size(R &&r)
Definition ranges.hpp:31
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, size_t n_children, ArrowArray **children, ArrowArray *dictionary)
Creates an ArrowArray.
functor_index_iterator< detail::layout_value_functor< array_type, inner_value_type > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_reference > > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.