sparrow 0.9.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
35
36namespace sparrow
37{
38 template <typename T>
39 concept variable_size_binary_view_impl_types = std::is_same_v<T, std::string_view>
40 || std::is_same_v<T, sequence_view<const std::byte>>;
41
42
43 template <variable_size_binary_view_impl_types T>
45
54
63
64 namespace detail
65 {
66 template <>
68 {
69 [[nodiscard]] static constexpr sparrow::data_type get()
70 {
72 }
73 };
74
75 template <>
77 {
78 [[nodiscard]] static constexpr sparrow::data_type get()
79 {
81 }
82 };
83 }
84
85 template <variable_size_binary_view_impl_types T>
98
99 template <variable_size_binary_view_impl_types T>
101 {
102 };
103
104 template <variable_size_binary_view_impl_types T>
108
112 template <variable_size_binary_view_impl_types T>
114
115 template <variable_size_binary_view_impl_types T>
117 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T>>
118 {
119 public:
120
123
125 using inner_value_type = typename inner_types::inner_value_type;
126 using inner_reference = typename inner_types::inner_reference;
127 using inner_const_reference = typename inner_types::inner_const_reference;
128
130 using bitmap_reference = typename base_type::bitmap_reference;
134 using bitmap_range = typename base_type::bitmap_range;
136
140
144
145 using value_iterator = typename base_type::value_iterator;
146 using const_value_iterator = typename base_type::const_value_iterator;
147
148 using iterator = typename base_type::iterator;
149 using const_iterator = typename base_type::const_iterator;
150
152
153 template <class... Args>
156 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
157 {
158 }
159
160 private:
161
162 struct buffers
163 {
164 buffer<uint8_t> length_buffer;
165 buffer<uint8_t> long_string_storage;
166 u8_buffer<int64_t> buffer_sizes;
167 };
168
169 template <std::ranges::input_range R>
170 requires std::convertible_to<std::ranges::range_value_t<R>, T>
171 static buffers create_buffers(R&& range);
172
173 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap, input_metadata_container METADATA_RANGE>
174 requires std::convertible_to<std::ranges::range_value_t<R>, T>
175 [[nodiscard]] static arrow_proxy create_proxy(
176 R&& range,
177 VB&& bitmap_input = validity_bitmap{},
178 std::optional<std::string_view> name = std::nullopt,
179 std::optional<METADATA_RANGE> metadata = std::nullopt
180 );
181
182 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
183 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
184 [[nodiscard]] static arrow_proxy create_proxy(
185 NULLABLE_RANGE&& nullable_range,
186 std::optional<std::string_view> name = std::nullopt,
187 std::optional<METADATA_RANGE> metadata = std::nullopt
188 );
189
190 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
191 requires std::convertible_to<std::ranges::range_value_t<R>, T>
192 [[nodiscard]] static arrow_proxy create_proxy(
193 R&& range,
194 bool = true,
195 std::optional<std::string_view> name = std::nullopt,
196 std::optional<METADATA_RANGE> metadata = std::nullopt
197 );
198
199 [[nodiscard]] constexpr inner_reference value(size_type i);
200 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
201
202 [[nodiscard]] constexpr value_iterator value_begin();
203 [[nodiscard]] constexpr value_iterator value_end();
204
205 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
206 [[nodiscard]] constexpr const_value_iterator value_cend() const;
207
208 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
209 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
210 static constexpr std::size_t SHORT_STRING_SIZE = 12;
211 static constexpr std::size_t PREFIX_SIZE = 4;
212 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
213 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
214 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
215 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
216 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
217
218 friend base_type;
222 };
223
224 template <variable_size_binary_view_impl_types T>
229
230 template <variable_size_binary_view_impl_types T>
231 template <std::ranges::input_range R>
232 requires std::convertible_to<std::ranges::range_value_t<R>, T>
233 auto variable_size_binary_view_array_impl<T>::create_buffers(R&& range) -> buffers
234 {
235#ifdef __GNUC__
236# pragma GCC diagnostic push
237# pragma GCC diagnostic ignored "-Wcast-align"
238#endif
239
240 const auto size = range_size(range);
241 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
242
243 std::size_t long_string_storage_size = 0;
244 std::size_t i = 0;
245 for (auto&& val : range)
246 {
247 auto val_casted = val
248 | std::ranges::views::transform(
249 [](const auto& v)
250 {
251 return static_cast<std::uint8_t>(v);
252 }
253 );
254
255 const auto length = val.size();
256 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
257
258 // write length
259 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
260
261 if (length <= SHORT_STRING_SIZE)
262 {
263 // write data itself
264 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
265 }
266 else
267 {
268 // write the prefix of the data
269 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
270 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
271
272 // write the buffer index
273 *reinterpret_cast<std::int32_t*>(
274 length_ptr + BUFFER_INDEX_OFFSET
275 ) = static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX);
276
277 // write the buffer offset
278 *reinterpret_cast<std::int32_t*>(
279 length_ptr + BUFFER_OFFSET_OFFSET
280 ) = static_cast<std::int32_t>(long_string_storage_size);
281
282 // count the size of the long string storage
283 long_string_storage_size += length;
284 }
285 ++i;
286 }
287
288 // write the long string storage
289 buffer<uint8_t> long_string_storage(long_string_storage_size);
290 std::size_t long_string_storage_offset = 0;
291 for (auto&& val : range)
292 {
293 const auto length = val.size();
294 if (length > SHORT_STRING_SIZE)
295 {
296 auto val_casted = val
297 | std::ranges::views::transform(
298 [](const auto& v)
299 {
300 return static_cast<std::uint8_t>(v);
301 }
302 );
303
304 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
305 long_string_storage_offset += length;
306 }
307 }
308
309 // For binary or utf-8 view arrays, an extra buffer is appended which stores
310 // the lengths of each variadic data buffer as int64_t.
311 // This buffer is necessary since these buffer lengths are not trivially
312 // extractable from other data in an array of binary or utf-8 view type.
313 u8_buffer<int64_t> buffer_sizes(
314 static_cast<std::size_t>(1),
315 static_cast<int64_t>(long_string_storage_size)
316 );
317
318 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
319
320#ifdef __GNUC__
321# pragma GCC diagnostic pop
322#endif
323 }
324
325 template <variable_size_binary_view_impl_types T>
326 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
327 requires std::convertible_to<std::ranges::range_value_t<R>, T>
328 arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
329 R&& range,
330 VB&& validity_input,
331 std::optional<std::string_view> name,
332 std::optional<METADATA_RANGE> metadata
333 )
334 {
335 const auto size = range_size(range);
336 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
337 const auto null_count = vbitmap.null_count();
338
340
341 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
342
343 // create arrow schema and array
344 ArrowSchema schema = make_arrow_schema(
345 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
346 std::move(name), // name
347 std::move(metadata), // metadata
348 flags, // flags
349 nullptr, // children
351 nullptr, // dictionary
352 true
353 );
354
355 // create buffers
356 auto buffers_parts = create_buffers(std::forward<R>(range));
357
358 std::vector<buffer<uint8_t>> buffers{
359 std::move(vbitmap).extract_storage(),
360 std::move(buffers_parts.length_buffer),
361 std::move(buffers_parts.long_string_storage),
362 std::move(buffers_parts.buffer_sizes).extract_storage()
363 };
364
365 // create arrow array
366 ArrowArray arr = make_arrow_array(
367 static_cast<std::int64_t>(size), // length
368 static_cast<int64_t>(null_count),
369 0, // offset
370 std::move(buffers),
371 nullptr, // children
373 nullptr, // dictionary
374 true
375 );
376
377 return arrow_proxy{std::move(arr), std::move(schema)};
378 }
379
380 template <variable_size_binary_view_impl_types T>
381 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
382 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
383 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
384 NULLABLE_RANGE&& nullable_range,
385 std::optional<std::string_view> name,
386 std::optional<METADATA_RANGE> metadata
387 )
388 {
389 auto values = nullable_range
390 | std::views::transform(
391 [](const auto& v)
392 {
393 return static_cast<std::string_view>(v.value());
394 }
395 );
396
397 auto is_non_null = nullable_range
398 | std::views::transform(
399 [](const auto& v)
400 {
401 return v.has_value();
402 }
403 );
404
405 return create_proxy(
406 std::forward<decltype(values)>(values),
407 std::forward<decltype(is_non_null)>(is_non_null),
408 name,
409 metadata
410 );
411 }
412
413 template <variable_size_binary_view_impl_types T>
414 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
415 requires std::convertible_to<std::ranges::range_value_t<R>, T>
416 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
417 R&& range,
418 bool nullable,
419 std::optional<std::string_view> name,
420 std::optional<METADATA_RANGE> metadata
421 )
422 {
423 if (nullable)
424 {
425 return create_proxy(std::forward<R>(range), validity_bitmap{}, std::move(name), std::move(metadata));
426 }
427 else
428 {
429 // create arrow schema and array
431 ArrowSchema schema = make_arrow_schema(
432 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
433 std::move(name), // name
434 std::move(metadata), // metadata
435 std::nullopt, // flags
436 nullptr, // children
438 nullptr, // dictionary
439 true
440 );
441
442 // create buffers
443 auto buffers_parts = create_buffers(std::forward<R>(range));
444
445 std::vector<buffer<uint8_t>> buffers{
446 buffer<uint8_t>{nullptr, 0}, // validity bitmap
447 std::move(buffers_parts.length_buffer),
448 std::move(buffers_parts.long_string_storage),
449 std::move(buffers_parts.buffer_sizes).extract_storage()
450 };
451 const auto size = range_size(range);
452
453 // create arrow array
454 ArrowArray arr = make_arrow_array(
455 static_cast<std::int64_t>(size), // length
456 static_cast<int64_t>(0),
457 0, // offset
458 std::move(buffers),
459 nullptr, // children
461 nullptr, // dictionary
462 true
463 );
464
465 return arrow_proxy{std::move(arr), std::move(schema)};
466 }
467 }
468
469 template <variable_size_binary_view_impl_types T>
470 constexpr auto variable_size_binary_view_array_impl<T>::value(size_type i) -> inner_reference
471 {
472 return static_cast<const self_type*>(this)->value(i);
473 }
474
475 template <variable_size_binary_view_impl_types T>
476 constexpr auto variable_size_binary_view_array_impl<T>::value(size_type i) const -> inner_const_reference
477 {
478#ifdef __GNUC__
479# pragma GCC diagnostic push
480# pragma GCC diagnostic ignored "-Wcast-align"
481#endif
482
483 SPARROW_ASSERT_TRUE(i < this->size());
484
485 constexpr std::size_t element_size = 16;
486 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
487 + (i * element_size);
488
489 auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
490 using char_or_byte = typename inner_const_reference::value_type;
491
492 if (length <= 12)
493 {
494 constexpr std::ptrdiff_t data_offset = 4;
495 auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
496 const auto ret = inner_const_reference(ptr + data_offset, length);
497 return ret;
498 }
499 else
500 {
501 constexpr std::ptrdiff_t buffer_index_offset = 8;
502 constexpr std::ptrdiff_t buffer_offset_offset = 12;
503 auto buffer_index = static_cast<std::size_t>(
504 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_index_offset)
505 );
506 auto buffer_offset = static_cast<std::size_t>(
507 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_offset_offset)
508 );
509 auto buffer = this->get_arrow_proxy().buffers()[buffer_index].template data<const char_or_byte>();
510 return inner_const_reference(buffer + buffer_offset, length);
511 }
512
513#ifdef __GNUC__
514# pragma GCC diagnostic pop
515#endif
516 }
517
518 template <variable_size_binary_view_impl_types T>
519 constexpr auto variable_size_binary_view_array_impl<T>::value_begin() -> value_iterator
520 {
522 }
523
524 template <variable_size_binary_view_impl_types T>
525 constexpr auto variable_size_binary_view_array_impl<T>::value_end() -> value_iterator
526 {
527 return value_iterator(detail::layout_value_functor<self_type, inner_value_type>(this), this->size());
528 }
529
530 template <variable_size_binary_view_impl_types T>
531 constexpr auto variable_size_binary_view_array_impl<T>::value_cbegin() const -> const_value_iterator
532 {
533 return const_value_iterator(detail::layout_value_functor<const self_type, inner_value_type>(this), 0);
534 }
535
536 template <variable_size_binary_view_impl_types T>
537 constexpr auto variable_size_binary_view_array_impl<T>::value_cend() const -> const_value_iterator
538 {
539 return const_value_iterator(
541 this->size()
542 );
543 }
544}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
constexpr U * data() noexcept
Definition buffer.hpp:629
constexpr size_type null_count() const noexcept
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:278
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
nullable< inner_const_reference, bitmap_const_reference > const_reference
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:106
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:574
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:117
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
variable_size_binary_view_array_impl< std::string_view > string_view_array
A variable-size string view layout implementation.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
variable_size_binary_view_array_impl< sequence_view< const std::byte > > binary_view_array
A variable-size binary view layout implementation.
functor_index_iterator< detail::layout_value_functor< array_type, inner_value_type > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_reference > > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.
Metafunction for retrieving the data_type of a typed array.