sparrow 0.9.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
34
35namespace sparrow
36{
37
38
39 template <class T>
43
// Implementation details: a trait mapping an array type to its runtime sparrow::data_type.
44 namespace detail
45 {
// Primary template: declared only, so the trait is usable solely through its specializations.
46 template <class T>
47 struct get_data_type_from_array;
48
// Explicit specialization exposing a static constexpr get() that returns a sparrow::data_type.
// NOTE(review): the specialized array type and the returned enumerator are not visible in this
// excerpt (the corresponding lines are missing) - confirm against the full header.
49 template <>
51 {
52 [[nodiscard]] static constexpr sparrow::data_type get()
53 {
55 }
56 };
57
// Second explicit specialization with the same shape as the one above.
// NOTE(review): its array type and returned enumerator are likewise not visible here.
58 template <>
60 {
61 [[nodiscard]] static constexpr sparrow::data_type get()
62 {
64 }
65 };
66 }
67
68 template <class T>
81
82 template <class T>
84 {
85 };
86
87 template <class T>
91
95 template <class T>
97
// Array whose values are described by fixed-size 16-byte entries (DATA_BUFFER_SIZE) plus one
// out-of-line storage buffer for the values longer than SHORT_STRING_SIZE; see create_buffers().
// create_proxy() emits the Arrow format "vu" when T is std::string_view and "vz" otherwise.
// NOTE(review): the class header, several type aliases and the constructor signature are not
// visible in this excerpt (lines are missing between the numbered ones below).
98 template <class T>
100 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T>>
101 {
102 public:
103
106
108 using inner_value_type = typename inner_types::inner_value_type;
109 using inner_reference = typename inner_types::inner_reference;
110 using inner_const_reference = typename inner_types::inner_const_reference;
111
113 using bitmap_reference = typename base_type::bitmap_reference;
117 using bitmap_range = typename base_type::bitmap_range;
119
123
127
128 using value_iterator = typename base_type::value_iterator;
129 using const_value_iterator = typename base_type::const_value_iterator;
130
131 using iterator = typename base_type::iterator;
132 using const_iterator = typename base_type::const_iterator;
133
135
// Variadic constructor: perfectly forwards its arguments to one of the create_proxy() overloads
// and delegates to the constructor taking the resulting arrow_proxy.
136 template <class... Args>
139 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
140 {
141 }
142
143 private:
144
// Result of create_buffers():
//  - length_buffer: one DATA_BUFFER_SIZE-byte entry per element (length + inline data or
//    prefix/buffer index/offset),
//  - long_string_storage: concatenated bytes of the elements longer than SHORT_STRING_SIZE,
//  - buffer_sizes: a single int64_t holding the size of long_string_storage.
145 struct buffers
146 {
147 buffer<uint8_t> length_buffer;
148 buffer<uint8_t> long_string_storage;
149 u8_buffer<int64_t> buffer_sizes;
150 };
151
// Builds the three buffers above from a range of values convertible to T.
152 template <std::ranges::input_range R>
153 requires std::convertible_to<std::ranges::range_value_t<R>, T>
154 static buffers create_buffers(R&& range);
155
// Builds the arrow_proxy from a range of values and a validity bitmap input; the schema is
// flagged NULLABLE.
// NOTE(review): as shown here METADATA_RANGE has no default template argument and cannot be
// deduced from the std::nullopt default - confirm how callers that omit metadata are expected
// to compile.
156 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap, input_metadata_container METADATA_RANGE>
157 requires std::convertible_to<std::ranges::range_value_t<R>, T>
158 [[nodiscard]] static arrow_proxy create_proxy(
159 R&& range,
160 VB&& bitmap_input = validity_bitmap{},
161 std::optional<std::string_view> name = std::nullopt,
162 std::optional<METADATA_RANGE> metadata = std::nullopt
163 );
164
// Builds the arrow_proxy from a range of nullable values: the values and their has_value()
// flags are split and passed to the overload above.
165 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
166 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
167 [[nodiscard]] static arrow_proxy create_proxy(
168 NULLABLE_RANGE&& nullable_range,
169 std::optional<std::string_view> name = std::nullopt,
170 std::optional<METADATA_RANGE> metadata = std::nullopt
171 );
172
// Builds the arrow_proxy from a range of values; the bool selects between a nullable array
// (delegates to the validity-bitmap overload) and one created with an empty validity buffer,
// no schema flags and a null count of 0.
173 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
174 requires std::convertible_to<std::ranges::range_value_t<R>, T>
175 [[nodiscard]] static arrow_proxy create_proxy(
176 R&& range,
177 bool = true,
178 std::optional<std::string_view> name = std::nullopt,
179 std::optional<METADATA_RANGE> metadata = std::nullopt
180 );
181
// Access to the i-th inner value, decoded from its 16-byte entry.
182 [[nodiscard]] inner_reference value(size_type i);
183 [[nodiscard]] inner_const_reference value(size_type i) const;
184
// Iterators over the inner values.
185 [[nodiscard]] value_iterator value_begin();
186 [[nodiscard]] value_iterator value_end();
187
188 [[nodiscard]] const_value_iterator value_cbegin() const;
189 [[nodiscard]] const_value_iterator value_cend() const;
190
// Index, in the arrow proxy's buffers, of the buffer holding the 16-byte entries
// (index 0 is the validity bitmap, see create_proxy()).
191 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
// Size in bytes of one entry.
192 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
// Longest value (in bytes) stored inline in its entry.
193 static constexpr std::size_t SHORT_STRING_SIZE = 12;
// Number of leading bytes of a long value copied into its entry.
194 static constexpr std::size_t PREFIX_SIZE = 4;
// Byte offsets inside an entry: the int32 length sits at offset 0, then either the inline
// data, or the prefix followed by the int32 buffer index and the int32 offset in that buffer.
195 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
196 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
197 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
198 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
// Buffer index written into the entry of every long value by create_buffers(); it is the
// position of long_string_storage in the buffer list assembled by create_proxy().
199 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
200
201 friend base_type;
203 };
204
205 template <class T>
210
211 template <class T>
212 template <std::ranges::input_range R>
213 requires std::convertible_to<std::ranges::range_value_t<R>, T>
214 auto variable_size_binary_view_array_impl<T>::create_buffers(R&& range) -> buffers
215 {
216#ifdef __GNUC__
217# pragma GCC diagnostic push
218# pragma GCC diagnostic ignored "-Wcast-align"
219#endif
220
221 const auto size = range_size(range);
222 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
223
224 std::size_t long_string_storage_size = 0;
225 std::size_t i = 0;
226 for (auto&& val : range)
227 {
228 auto val_casted = val
229 | std::ranges::views::transform(
230 [](const auto& v)
231 {
232 return static_cast<std::uint8_t>(v);
233 }
234 );
235
236 const auto length = val.size();
237 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
238
239 // write length
240 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
241
242 if (length <= SHORT_STRING_SIZE)
243 {
244 // write data itself
245 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
246 }
247 else
248 {
249 // write the prefix of the data
250 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
251 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
252
253 // write the buffer index
254 *reinterpret_cast<std::int32_t*>(
255 length_ptr + BUFFER_INDEX_OFFSET
256 ) = static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX);
257
258 // write the buffer offset
259 *reinterpret_cast<std::int32_t*>(
260 length_ptr + BUFFER_OFFSET_OFFSET
261 ) = static_cast<std::int32_t>(long_string_storage_size);
262
263 // count the size of the long string storage
264 long_string_storage_size += length;
265 }
266 ++i;
267 }
268
269 // write the long string storage
270 buffer<uint8_t> long_string_storage(long_string_storage_size);
271 std::size_t long_string_storage_offset = 0;
272 for (auto&& val : range)
273 {
274 const auto length = val.size();
275 if (length > SHORT_STRING_SIZE)
276 {
277 auto val_casted = val
278 | std::ranges::views::transform(
279 [](const auto& v)
280 {
281 return static_cast<std::uint8_t>(v);
282 }
283 );
284
285 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
286 long_string_storage_offset += length;
287 }
288 }
289
290 // For binary or utf-8 view arrays, an extra buffer is appended which stores
291 // the lengths of each variadic data buffer as int64_t.
292 // This buffer is necessary since these buffer lengths are not trivially
293 // extractable from other data in an array of binary or utf-8 view type.
294 u8_buffer<int64_t> buffer_sizes(
295 static_cast<std::size_t>(1),
296 static_cast<int64_t>(long_string_storage_size)
297 );
298
299 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
300
301#ifdef __GNUC__
302# pragma GCC diagnostic pop
303#endif
304 }
305
// Builds the arrow_proxy (ArrowArray + ArrowSchema) of a nullable view array.
//  - range: the values to store (forwarded to create_buffers()).
//  - validity_input: converted to a validity_bitmap of the range's size by
//    ensure_validity_bitmap(); its null count becomes the array's null count.
//  - name / metadata: moved into the schema.
// The schema format is "vu" when T is std::string_view and "vz" otherwise, and it carries the
// NULLABLE flag.
// NOTE(review): a few lines of this definition are not visible in this excerpt; judging by
// the make_arrow_schema/make_arrow_array signatures they include the children_ownership
// argument expected between "children" and "dictionary" - confirm against the full header.
306 template <class T>
307 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
308 requires std::convertible_to<std::ranges::range_value_t<R>, T>
309 arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
310 R&& range,
311 VB&& validity_input,
312 std::optional<std::string_view> name,
313 std::optional<METADATA_RANGE> metadata
314 )
315 {
// The size is read before the range is forwarded to create_buffers() below.
316 const auto size = range_size(range);
317 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
318 const auto null_count = vbitmap.null_count();
319
321
322 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
323
324 // create arrow schema and array
325 ArrowSchema schema = make_arrow_schema(
326 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
327 std::move(name), // name
328 std::move(metadata), // metadata
329 flags, // flags
330 nullptr, // children
332 nullptr, // dictionary
333 true
334 );
335
336 // create buffers
337 auto buffers_parts = create_buffers(std::forward<R>(range));
338
// Buffer order: validity bitmap, 16-byte entries, out-of-line storage, storage sizes.
339 std::vector<buffer<uint8_t>> buffers{
340 std::move(vbitmap).extract_storage(),
341 std::move(buffers_parts.length_buffer),
342 std::move(buffers_parts.long_string_storage),
343 std::move(buffers_parts.buffer_sizes).extract_storage()
344 };
345
346 // create arrow array
347 ArrowArray arr = make_arrow_array(
348 static_cast<std::int64_t>(size), // length
349 static_cast<int64_t>(null_count),
350 0, // offset
351 std::move(buffers),
352 nullptr, // children
354 nullptr, // dictionary
355 true
356 );
357
358 return arrow_proxy{std::move(arr), std::move(schema)};
359 }
360
361 template <class T>
362 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
363 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
364 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
365 NULLABLE_RANGE&& nullable_range,
366 std::optional<std::string_view> name,
367 std::optional<METADATA_RANGE> metadata
368 )
369 {
370 auto values = nullable_range
371 | std::views::transform(
372 [](const auto& v)
373 {
374 return static_cast<std::string_view>(v.value());
375 }
376 );
377
378 auto is_non_null = nullable_range
379 | std::views::transform(
380 [](const auto& v)
381 {
382 return v.has_value();
383 }
384 );
385
386 return create_proxy(
387 std::forward<decltype(values)>(values),
388 std::forward<decltype(is_non_null)>(is_non_null),
389 name,
390 metadata
391 );
392 }
393
// Builds the arrow_proxy of a view array from a range of values, nullable or not.
//  - nullable == true: delegates to the validity-bitmap overload with an empty
//    validity_bitmap.
//  - nullable == false: builds the schema without flags, uses an empty validity buffer and a
//    null count of 0.
// The schema format is "vu" when T is std::string_view and "vz" otherwise.
// NOTE(review): a few lines of this definition are not visible in this excerpt; judging by
// the make_arrow_schema/make_arrow_array signatures they include the children_ownership
// argument expected between "children" and "dictionary" - confirm against the full header.
394 template <class T>
395 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
396 requires std::convertible_to<std::ranges::range_value_t<R>, T>
397 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
398 R&& range,
399 bool nullable,
400 std::optional<std::string_view> name,
401 std::optional<METADATA_RANGE> metadata
402 )
403 {
404 if (nullable)
405 {
406 return create_proxy(std::forward<R>(range), validity_bitmap{}, std::move(name), std::move(metadata));
407 }
408 else
409 {
410 // create arrow schema and array
412 ArrowSchema schema = make_arrow_schema(
413 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
414 std::move(name), // name
415 std::move(metadata), // metadata
416 std::nullopt, // flags
417 nullptr, // children
419 nullptr, // dictionary
420 true
421 );
422
423 // create buffers
424 auto buffers_parts = create_buffers(std::forward<R>(range));
425
// Buffer order: validity bitmap (empty here), 16-byte entries, out-of-line storage,
// storage sizes.
426 std::vector<buffer<uint8_t>> buffers{
427 buffer<uint8_t>{nullptr, 0}, // validity bitmap
428 std::move(buffers_parts.length_buffer),
429 std::move(buffers_parts.long_string_storage),
430 std::move(buffers_parts.buffer_sizes).extract_storage()
431 };
// NOTE(review): range is read again here after having been forwarded to create_buffers()
// above. create_buffers() as defined in this file only iterates over it, but the other
// create_proxy overload takes the size before forwarding - consider doing the same here.
432 const auto size = range_size(range);
433
434 // create arrow array
435 ArrowArray arr = make_arrow_array(
436 static_cast<std::int64_t>(size), // length
437 static_cast<int64_t>(0),
438 0, // offset
439 std::move(buffers),
440 nullptr, // children
442 nullptr, // dictionary
443 true
444 );
445
446 return arrow_proxy{std::move(arr), std::move(schema)};
447 }
448 }
449
450 template <class T>
451 auto variable_size_binary_view_array_impl<T>::value(size_type i) -> inner_reference
452 {
453 return static_cast<const self_type*>(this)->value(i);
454 }
455
456 template <class T>
457 auto variable_size_binary_view_array_impl<T>::value(size_type i) const -> inner_const_reference
458 {
459#ifdef __GNUC__
460# pragma GCC diagnostic push
461# pragma GCC diagnostic ignored "-Wcast-align"
462#endif
463
464 SPARROW_ASSERT_TRUE(i < this->size());
465
466 constexpr std::size_t element_size = 16;
467 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
468 + (i * element_size);
469
470 auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
471 using char_or_byte = typename inner_const_reference::value_type;
472
473 if (length <= 12)
474 {
475 constexpr std::ptrdiff_t data_offset = 4;
476 auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
477 const auto ret = inner_const_reference(ptr + data_offset, length);
478 return ret;
479 }
480 else
481 {
482 constexpr std::ptrdiff_t buffer_index_offset = 8;
483 constexpr std::ptrdiff_t buffer_offset_offset = 12;
484 auto buffer_index = static_cast<std::size_t>(
485 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_index_offset)
486 );
487 auto buffer_offset = static_cast<std::size_t>(
488 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_offset_offset)
489 );
490 auto buffer = this->get_arrow_proxy().buffers()[buffer_index].template data<const char_or_byte>();
491 return inner_const_reference(buffer + buffer_offset, length);
492 }
493
494#ifdef __GNUC__
495# pragma GCC diagnostic pop
496#endif
497 }
498
// Returns a value_iterator over the inner values.
// NOTE(review): the return statement is not visible in this excerpt (its line is missing);
// presumably it mirrors value_end() with index 0 - confirm against the full header.
499 template <class T>
500 auto variable_size_binary_view_array_impl<T>::value_begin() -> value_iterator
501 {
503 }
504
505 template <class T>
506 auto variable_size_binary_view_array_impl<T>::value_end() -> value_iterator
507 {
508 return value_iterator(detail::layout_value_functor<self_type, inner_value_type>(this), this->size());
509 }
510
511 template <class T>
512 auto variable_size_binary_view_array_impl<T>::value_cbegin() const -> const_value_iterator
513 {
514 return const_value_iterator(detail::layout_value_functor<const self_type, inner_value_type>(this), 0);
515 }
516
// Returns the constant iterator over the inner values positioned at index size().
// NOTE(review): the first constructor argument is not visible in this excerpt (its line is
// missing); presumably it is the same functor as in value_cbegin() - confirm against the
// full header.
517 template <class T>
518 auto variable_size_binary_view_array_impl<T>::value_cend() const -> const_value_iterator
519 {
520 return const_value_iterator(
522 this->size()
523 );
524 }
525}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
constexpr U * data() noexcept
Definition buffer.hpp:629
constexpr size_type null_count() const noexcept
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:281
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
nullable< inner_const_reference, bitmap_const_reference > const_reference
variable_size_binary_view_array_impl< std::string_view > self_type
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:116
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
variable_size_binary_view_array_impl< std::span< const std::byte > > binary_view_array
variable_size_binary_view_array_impl< std::string_view > string_view_array
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
std::size_t range_size(R &&r)
Definition ranges.hpp:31
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
functor_index_iterator< detail::layout_value_functor< array_type, inner_value_type > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_reference > > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.