sparrow ..
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
28#include "sparrow/u8_buffer.hpp"
36
37namespace sparrow
38{
39 template <std::ranges::sized_range T, class CR>
41
52
62 arrow_traits<std::vector<byte_t>>::const_reference>;
63
64 namespace detail
65 {
66 template <>
68 {
69 [[nodiscard]] static constexpr sparrow::data_type get()
70 {
72 }
73 };
74
75 template <>
77 {
78 [[nodiscard]] static constexpr sparrow::data_type get()
79 {
81 }
82 };
83 }
84
85 template <std::ranges::sized_range T, class CR>
98
99 template <class T>
101 {
102 };
103
104 template <std::ranges::sized_range T, class CR>
108
112 template <class T>
114
154 template <std::ranges::sized_range T, class CR>
156 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T, CR>>
157 {
158 public:
159
162
164 using inner_value_type = typename inner_types::inner_value_type;
165 using inner_reference = typename inner_types::inner_reference;
166 using inner_const_reference = typename inner_types::inner_const_reference;
167
169 using bitmap_reference = typename base_type::bitmap_reference;
173 using bitmap_range = typename base_type::bitmap_range;
175
179
183
184 using value_iterator = typename base_type::value_iterator;
185 using const_value_iterator = typename base_type::const_value_iterator;
186
187 using iterator = typename base_type::iterator;
188 using const_iterator = typename base_type::const_iterator;
189
202
217 template <class... Args>
220 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
221 {
222 }
223
224 private:
225
// Result type of create_buffers(): the three buffers it builds for one array.
// Members are listed in the order create_buffers() brace-initializes them.
234 struct buffers
235 {
// One DATA_BUFFER_SIZE-byte view record per element; create_buffers() allocates
// (number of elements * DATA_BUFFER_SIZE) bytes for it.
236 buffer<uint8_t> length_buffer;
// Bytes of every value longer than SHORT_STRING_SIZE, concatenated in iteration order.
237 buffer<uint8_t> long_string_storage;
// Holds a single int64 entry: the byte size of long_string_storage.
238 u8_buffer<int64_t> buffer_sizes;
239 };
240
259 template <std::ranges::input_range R>
260 requires std::convertible_to<std::ranges::range_value_t<R>, T>
261 static buffers create_buffers(R&& range);
262
282 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap, input_metadata_container METADATA_RANGE>
283 requires std::convertible_to<std::ranges::range_value_t<R>, T>
284 [[nodiscard]] static arrow_proxy create_proxy(
285 R&& range,
286 VB&& bitmap_input = validity_bitmap{},
287 std::optional<std::string_view> name = std::nullopt,
288 std::optional<METADATA_RANGE> metadata = std::nullopt
289 );
290
308 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
309 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
310 [[nodiscard]] static arrow_proxy create_proxy(
311 NULLABLE_RANGE&& nullable_range,
312 std::optional<std::string_view> name = std::nullopt,
313 std::optional<METADATA_RANGE> metadata = std::nullopt
314 );
315
332 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
333 requires std::convertible_to<std::ranges::range_value_t<R>, T>
334 [[nodiscard]] static arrow_proxy create_proxy(
335 R&& range,
336 bool = true,
337 std::optional<std::string_view> name = std::nullopt,
338 std::optional<METADATA_RANGE> metadata = std::nullopt
339 );
340
360 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
361 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
362 [[nodiscard]] static arrow_proxy create_proxy(
363 size_t element_count,
364 u8_buffer<uint8_t>&& buffer_view,
365 VALUE_BUFFERS_RANGE&& value_buffers,
366 VB&& validity_input,
367 std::optional<std::string_view> name = std::nullopt,
368 std::optional<METADATA_RANGE> metadata = std::nullopt
369 );
370
382 [[nodiscard]] constexpr inner_reference value(size_type i);
383
398 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
399
407 [[nodiscard]] constexpr value_iterator value_begin();
408
416 [[nodiscard]] constexpr value_iterator value_end();
417
425 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
426
434 [[nodiscard]] constexpr const_value_iterator value_cend() const;
435
// Layout constants shared by create_buffers() (writer) and value() (reader).
// Each element owns one fixed-size view record; a record starts with a 32-bit length.
// Position of the view-record buffer in the array's buffer list (the validity bitmap is placed before it).
436 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
// Size in bytes of one view record.
437 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
// Longest value (in bytes) that is stored inline inside its view record.
438 static constexpr std::size_t SHORT_STRING_SIZE = 12;
// Number of leading bytes of a long value that are duplicated inside its view record.
439 static constexpr std::size_t PREFIX_SIZE = 4;
// Byte offset, within a record, of the prefix of a long value.
440 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
// Byte offset, within a record, of the inline bytes of a short value.
441 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
// Byte offset, within a record, of the 32-bit index of the variadic buffer holding a long value.
442 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
// Byte offset, within a record, of the 32-bit position of a long value inside that variadic buffer.
443 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
// Position of the first variadic data buffer in the array's buffer list; value() adds the stored buffer index to it.
444 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
445
446 friend base_type;
451 };
452
453 template <std::ranges::sized_range T, class CR>
458
459 template <std::ranges::sized_range T, class CR>
460 template <std::ranges::input_range R>
461 requires std::convertible_to<std::ranges::range_value_t<R>, T>
462 auto variable_size_binary_view_array_impl<T, CR>::create_buffers(R&& range) -> buffers
463 {
464#ifdef __GNUC__
465# pragma GCC diagnostic push
466# pragma GCC diagnostic ignored "-Wcast-align"
467#endif
468
469 const auto size = range_size(range);
470 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
471
472 std::size_t long_string_storage_size = 0;
473 std::size_t i = 0;
474 for (auto&& val : range)
475 {
476 auto val_casted = val
477 | std::ranges::views::transform(
478 [](const auto& v)
479 {
480 return static_cast<std::uint8_t>(v);
481 }
482 );
483
484 const auto length = val.size();
485 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
486
487 // write length
488 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
489
490 if (length <= SHORT_STRING_SIZE)
491 {
492 // write data itself
493 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
494 std::fill(
495 length_ptr + SHORT_STRING_OFFSET + length,
496 length_ptr + DATA_BUFFER_SIZE,
497 std::uint8_t(0)
498 );
499 }
500 else
501 {
502 // write the prefix of the data
503 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
504 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
505
506 // write the buffer index
507 *reinterpret_cast<std::int32_t*>(length_ptr + BUFFER_INDEX_OFFSET) = 0;
508
509 // write the buffer offset
510 *reinterpret_cast<std::int32_t*>(
511 length_ptr + BUFFER_OFFSET_OFFSET
512 ) = static_cast<std::int32_t>(long_string_storage_size);
513
514 // count the size of the long string storage
515 long_string_storage_size += length;
516 }
517 ++i;
518 }
519
520 // write the long string storage
521 buffer<uint8_t> long_string_storage(long_string_storage_size);
522 std::size_t long_string_storage_offset = 0;
523 for (auto&& val : range)
524 {
525 const auto length = val.size();
526 if (length > SHORT_STRING_SIZE)
527 {
528 auto val_casted = val
529 | std::ranges::views::transform(
530 [](const auto& v)
531 {
532 return static_cast<std::uint8_t>(v);
533 }
534 );
535
536 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
537 long_string_storage_offset += length;
538 }
539 }
540
541 // For binary or utf-8 view arrays, an extra buffer is appended which stores
542 // the lengths of each variadic data buffer as int64_t.
543 // This buffer is necessary since these buffer lengths are not trivially
544 // extractable from other data in an array of binary or utf-8 view type.
545 u8_buffer<int64_t> buffer_sizes(
546 static_cast<std::size_t>(1),
547 static_cast<int64_t>(long_string_storage_size)
548 );
549
550 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
551
552#ifdef __GNUC__
553# pragma GCC diagnostic pop
554#endif
555 }
556
557 template <std::ranges::sized_range T, class CR>
558 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
559 requires std::convertible_to<std::ranges::range_value_t<R>, T>
560 arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
561 R&& range,
562 VB&& validity_input,
563 std::optional<std::string_view> name,
564 std::optional<METADATA_RANGE> metadata
565 )
566 {
567 const auto size = range_size(range);
568 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
569 const auto null_count = vbitmap.null_count();
570
572
573 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
574
575 // create arrow schema and array
576 ArrowSchema schema = make_arrow_schema(
577 std::is_same<T, arrow_traits<std::string>::value_type>::value ? std::string_view("vu")
578 : std::string_view("vz"),
579 std::move(name), // name
580 std::move(metadata), // metadata
581 flags, // flags
582 nullptr, // children
584 nullptr, // dictionary
585 true
586 );
587
588 // create buffers
589 auto buffers_parts = create_buffers(std::forward<R>(range));
590
591 std::vector<buffer<uint8_t>> buffers{
592 std::move(vbitmap).extract_storage(),
593 std::move(buffers_parts.length_buffer),
594 std::move(buffers_parts.long_string_storage),
595 std::move(buffers_parts.buffer_sizes).extract_storage()
596 };
597
598 // create arrow array
599 ArrowArray arr = make_arrow_array(
600 static_cast<std::int64_t>(size), // length
601 static_cast<int64_t>(null_count),
602 0, // offset
603 std::move(buffers),
604 nullptr, // children
606 nullptr, // dictionary
607 true
608 );
609
610 return arrow_proxy{std::move(arr), std::move(schema)};
611 }
612
613 template <std::ranges::sized_range T, class CR>
614 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
615 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
616 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
617 NULLABLE_RANGE&& nullable_range,
618 std::optional<std::string_view> name,
619 std::optional<METADATA_RANGE> metadata
620 )
621 {
622 auto values = nullable_range
623 | std::views::transform(
624 [](const auto& v)
625 {
626 return static_cast<T>(v.value());
627 }
628 );
629
630 auto is_non_null = nullable_range
631 | std::views::transform(
632 [](const auto& v)
633 {
634 return v.has_value();
635 }
636 );
637
638 return create_proxy(
639 std::forward<decltype(values)>(values),
640 std::forward<decltype(is_non_null)>(is_non_null),
641 name,
642 metadata
643 );
644 }
645
646 template <std::ranges::sized_range T, class CR>
647 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
648 requires std::convertible_to<std::ranges::range_value_t<R>, T>
649 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
650 R&& range,
651 bool nullable,
652 std::optional<std::string_view> name,
653 std::optional<METADATA_RANGE> metadata
654 )
655 {
656 if (nullable)
657 {
658 return create_proxy(std::forward<R>(range), validity_bitmap{}, std::move(name), std::move(metadata));
659 }
660 else
661 {
662 // create arrow schema and array
664 ArrowSchema schema = make_arrow_schema(
665 std::is_same<T, arrow_traits<std::string>::value_type>::value ? std::string_view("vu")
666 : std::string_view("vz"),
667 std::move(name), // name
668 std::move(metadata), // metadata
669 std::nullopt, // flags
670 nullptr, // children
672 nullptr, // dictionary
673 true
674 );
675
676 // create buffers
677 auto buffers_parts = create_buffers(std::forward<R>(range));
678
679 std::vector<buffer<uint8_t>> buffers{
680 buffer<uint8_t>{nullptr, 0}, // validity bitmap
681 std::move(buffers_parts.length_buffer),
682 std::move(buffers_parts.long_string_storage),
683 std::move(buffers_parts.buffer_sizes).extract_storage()
684 };
685 const auto size = range_size(range);
686
687 // create arrow array
688 ArrowArray arr = make_arrow_array(
689 static_cast<std::int64_t>(size), // length
690 static_cast<int64_t>(0),
691 0, // offset
692 std::move(buffers),
693 nullptr, // children
695 nullptr, // dictionary
696 true
697 );
698
699 return arrow_proxy{std::move(arr), std::move(schema)};
700 }
701 }
702
703 template <std::ranges::sized_range T, class CR>
704 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
705 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
706 arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
707 size_t element_count,
709 VALUE_BUFFERS_RANGE&& value_buffers,
710 VB&& validity_input,
711 std::optional<std::string_view> name,
712 std::optional<METADATA_RANGE> metadata
713 )
714 {
715 const auto size = buffer_view.size() / DATA_BUFFER_SIZE;
716 SPARROW_ASSERT_TRUE(size == element_count);
717
718 constexpr repeat_view<bool> children_ownership(true, 0);
719 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
720
721
722 ArrowSchema schema = make_arrow_schema(
723 std::is_same<T, arrow_traits<std::string>::value_type>::value ? std::string_view("vu")
724 : std::string_view("vz"),
725 std::move(name), // name
726 std::move(metadata), // metadata
727 flags, // flags
728 nullptr, // children
730 nullptr, // dictionary
731 true
732 );
733
734 auto bitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
735 std::vector<buffer<uint8_t>> buffers{std::move(bitmap).extract_storage(), std::move(buffer_view)};
736 for (auto&& buf : value_buffers)
737 {
738 buffers.push_back(std::forward<decltype(buf)>(buf));
739 }
740
741 // Create buffer sizes for the variadic buffers
742 u8_buffer<int64_t> buffer_sizes(value_buffers.size());
743 for (std::size_t i = 0; i < value_buffers.size(); ++i)
744 {
745 buffer_sizes[i] = static_cast<int64_t>(value_buffers[i].size());
746 }
747 buffers.push_back(std::move(buffer_sizes).extract_storage());
748
749 ArrowArray arr = make_arrow_array(
750 static_cast<std::int64_t>(size), // length
751 static_cast<std::int64_t>(bitmap.null_count()), // null_count
752 0, // offset
753 std::move(buffers),
754 nullptr, // children
756 nullptr, // dictionary
757 true
758 );
759
760 return arrow_proxy{std::move(arr), std::move(schema)};
761 }
762
763 template <std::ranges::sized_range T, class CR>
764 constexpr auto variable_size_binary_view_array_impl<T, CR>::value(size_type i) -> inner_reference
765 {
766 return static_cast<const self_type*>(this)->value(i);
767 }
768
769 template <std::ranges::sized_range T, class CR>
770 constexpr auto variable_size_binary_view_array_impl<T, CR>::value(size_type i) const
771 -> inner_const_reference
772 {
773#ifdef __GNUC__
774# pragma GCC diagnostic push
775# pragma GCC diagnostic ignored "-Wcast-align"
776#endif
777
778 SPARROW_ASSERT_TRUE(i < this->size());
779 using char_or_byte = typename inner_const_reference::value_type;
780
781 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
782 + (i * DATA_BUFFER_SIZE);
783 const auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
784
785 if (length <= SHORT_STRING_SIZE)
786 {
787 constexpr std::ptrdiff_t data_offset = 4;
788 const auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
789 const auto ret = inner_const_reference(ptr + data_offset, length);
790 return ret;
791 }
792 else
793 {
794 const auto buffer_index = static_cast<std::size_t>(
795 *reinterpret_cast<const std::int32_t*>(data_ptr + BUFFER_INDEX_OFFSET)
796 );
797 const auto buffer_offset = static_cast<std::size_t>(
798 *reinterpret_cast<const std::int32_t*>(data_ptr + BUFFER_OFFSET_OFFSET)
799 );
800 const auto buffer = this->get_arrow_proxy()
801 .buffers()[buffer_index + FIRST_VAR_DATA_BUFFER_INDEX]
802 .template data<const char_or_byte>();
803 return inner_const_reference(buffer + buffer_offset, length);
804 }
805
806#ifdef __GNUC__
807# pragma GCC diagnostic pop
808#endif
809 }
810
811 template <std::ranges::sized_range T, class CR>
812 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_begin() -> value_iterator
813 {
815 }
816
817 template <std::ranges::sized_range T, class CR>
818 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_end() -> value_iterator
819 {
820 return value_iterator(detail::layout_value_functor<self_type, inner_reference>(this), this->size());
821 }
822
823 template <std::ranges::sized_range T, class CR>
824 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_cbegin() const -> const_value_iterator
825 {
827 }
828
829 template <std::ranges::sized_range T, class CR>
830 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_cend() const -> const_value_iterator
831 {
832 return const_value_iterator(
834 this->size()
835 );
836 }
837}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
constexpr size_type size() const noexcept
Object that owns a piece of contiguous memory.
Definition buffer.hpp:113
constexpr U * data() noexcept
Definition buffer.hpp:630
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_view_array_impl(arrow_proxy)
Constructs variable-size binary view array from Arrow proxy.
variable_size_binary_view_array_impl(Args &&... args)
Generic constructor for creating variable-size binary view array.
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:117
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
functor_index_iterator< detail::layout_value_functor< array_type, inner_reference > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_const_reference > > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.