sparrow 1.2.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
28#include "sparrow/u8_buffer.hpp"
36
37namespace sparrow
38{
39 template <std::ranges::sized_range T, class CR>
41
52
62 arrow_traits<std::vector<byte_t>>::const_reference>;
63
64 namespace detail
65 {
66 template <>
68 {
69 [[nodiscard]] static constexpr sparrow::data_type get()
70 {
72 }
73 };
74
75 template <>
77 {
78 [[nodiscard]] static constexpr sparrow::data_type get()
79 {
81 }
82 };
83 }
84
85 template <std::ranges::sized_range T, class CR>
98
99 template <class T>
101 {
102 };
103
104 template <std::ranges::sized_range T, class CR>
108
112 template <class T>
114
154 template <std::ranges::sized_range T, class CR>
156 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T, CR>>
157 {
158 public:
159
162
164 using inner_value_type = typename inner_types::inner_value_type;
165 using inner_reference = typename inner_types::inner_reference;
166 using inner_const_reference = typename inner_types::inner_const_reference;
167
169 using bitmap_reference = typename base_type::bitmap_reference;
173 using bitmap_range = typename base_type::bitmap_range;
175
179
183
184 using value_iterator = typename base_type::value_iterator;
185 using const_value_iterator = typename base_type::const_value_iterator;
186
187 using iterator = typename base_type::iterator;
188 using const_iterator = typename base_type::const_iterator;
189
202
217 template <class... Args>
220 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
221 {
222 }
223
224 private:
225
// Bundle of the raw buffers produced by create_buffers().
// The member order matters: create_buffers() fills this aggregate with a
// brace-initialised return statement.
234 struct buffers
235 {
// One DATA_BUFFER_SIZE-byte view record per element (length + inline data or prefix/index/offset).
236 buffer<uint8_t> length_buffer;
// Concatenated bytes of every element longer than SHORT_STRING_SIZE.
237 buffer<uint8_t> long_string_storage;
// Sizes of the variadic data buffers as int64_t; create_buffers() stores a single entry,
// the size of long_string_storage.
238 u8_buffer<int64_t> buffer_sizes;
239 };
240
246 [[nodiscard]] static constexpr std::string_view get_arrow_format()
247 {
248 return std::is_same_v<T, arrow_traits<std::string>::value_type> ? std::string_view("vu")
249 : std::string_view("vz");
250 }
251
261 template <input_metadata_container METADATA_RANGE>
262 [[nodiscard]] static ArrowSchema create_arrow_schema(
263 std::optional<std::string_view> name,
264 std::optional<METADATA_RANGE> metadata,
265 std::optional<std::unordered_set<sparrow::ArrowFlag>> flags
266 )
267 {
268 constexpr repeat_view<bool> children_ownership(true, 0);
269 return make_arrow_schema(
270 get_arrow_format(),
271 std::move(name),
272 std::move(metadata),
273 flags,
274 nullptr, // children
275 children_ownership,
276 nullptr, // dictionary
277 true
278 );
279 }
280
299 template <std::ranges::input_range R>
300 requires std::convertible_to<std::ranges::range_value_t<R>, T>
301 static buffers create_buffers(R&& range);
302
322 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap, input_metadata_container METADATA_RANGE>
323 requires std::convertible_to<std::ranges::range_value_t<R>, T>
324 [[nodiscard]] static arrow_proxy create_proxy(
325 R&& range,
326 VB&& bitmap_input = validity_bitmap{},
327 std::optional<std::string_view> name = std::nullopt,
328 std::optional<METADATA_RANGE> metadata = std::nullopt
329 );
330
348 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
349 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
350 [[nodiscard]] static arrow_proxy create_proxy(
351 NULLABLE_RANGE&& nullable_range,
352 std::optional<std::string_view> name = std::nullopt,
353 std::optional<METADATA_RANGE> metadata = std::nullopt
354 );
355
372 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
373 requires std::convertible_to<std::ranges::range_value_t<R>, T>
374 [[nodiscard]] static arrow_proxy create_proxy(
375 R&& range,
376 bool = true,
377 std::optional<std::string_view> name = std::nullopt,
378 std::optional<METADATA_RANGE> metadata = std::nullopt
379 );
380
400 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
401 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
402 [[nodiscard]] static arrow_proxy create_proxy(
403 size_t element_count,
404 u8_buffer<uint8_t>&& buffer_view,
405 VALUE_BUFFERS_RANGE&& value_buffers,
406 VB&& validity_input,
407 std::optional<std::string_view> name = std::nullopt,
408 std::optional<METADATA_RANGE> metadata = std::nullopt
409 );
410
422 [[nodiscard]] constexpr inner_reference value(size_type i);
423
438 [[nodiscard]] constexpr inner_const_reference value(size_type i) const;
439
447 [[nodiscard]] constexpr value_iterator value_begin();
448
456 [[nodiscard]] constexpr value_iterator value_end();
457
465 [[nodiscard]] constexpr const_value_iterator value_cbegin() const;
466
474 [[nodiscard]] constexpr const_value_iterator value_cend() const;
475
// Index, within the arrow proxy buffers, of the buffer holding the view records.
476 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
// Size in bytes of one view record; element i starts at byte i * DATA_BUFFER_SIZE.
477 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
// Longest element (in bytes) that is stored inline inside its view record.
478 static constexpr std::size_t SHORT_STRING_SIZE = 12;
// Number of leading bytes of a long element that are copied into its view record.
479 static constexpr std::size_t PREFIX_SIZE = 4;
// Byte offset of the prefix inside a view record (right after the 4-byte length).
480 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
// Byte offset of the inline data inside a view record (right after the 4-byte length).
481 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
// Byte offset of the 32-bit variadic buffer index inside a long-element view record.
482 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
// Byte offset of the 32-bit offset into the variadic buffer inside a long-element view record.
483 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
// Index, within the arrow proxy buffers, of the first variadic data buffer;
// the buffer index stored in a view record is relative to it.
484 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
485
486 friend base_type;
491 };
492
493 template <std::ranges::sized_range T, class CR>
498
499 template <std::ranges::sized_range T, class CR>
500 template <std::ranges::input_range R>
501 requires std::convertible_to<std::ranges::range_value_t<R>, T>
502 auto variable_size_binary_view_array_impl<T, CR>::create_buffers(R&& range) -> buffers
503 {
504#ifdef __GNUC__
505# pragma GCC diagnostic push
506# pragma GCC diagnostic ignored "-Wcast-align"
507#endif
508
509 // Helper lambda to cast values to uint8_t
510 auto to_uint8 = [](const auto& v)
511 {
512 return static_cast<std::uint8_t>(v);
513 };
514
515 const auto size = range_size(range);
516 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
517
518 std::size_t long_string_storage_size = 0;
519 std::size_t i = 0;
520 for (auto&& val : range)
521 {
522 auto val_casted = val | std::ranges::views::transform(to_uint8);
523
524 const auto length = val.size();
525 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
526
527 // write length
528 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
529
530 if (length <= SHORT_STRING_SIZE)
531 {
532 // write data itself
533 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
534 std::fill(
535 length_ptr + SHORT_STRING_OFFSET + length,
536 length_ptr + DATA_BUFFER_SIZE,
537 std::uint8_t(0)
538 );
539 }
540 else
541 {
542 // write the prefix of the data
543 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
544 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
545
546 // write the buffer index
547 *reinterpret_cast<std::int32_t*>(length_ptr + BUFFER_INDEX_OFFSET) = 0;
548
549 // write the buffer offset
550 *reinterpret_cast<std::int32_t*>(
551 length_ptr + BUFFER_OFFSET_OFFSET
552 ) = static_cast<std::int32_t>(long_string_storage_size);
553
554 // count the size of the long string storage
555 long_string_storage_size += length;
556 }
557 ++i;
558 }
559
560 // write the long string storage
561 buffer<uint8_t> long_string_storage(long_string_storage_size);
562 std::size_t long_string_storage_offset = 0;
563 for (auto&& val : range)
564 {
565 const auto length = val.size();
566 if (length > SHORT_STRING_SIZE)
567 {
568 auto val_casted = val | std::ranges::views::transform(to_uint8);
569
570 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
571 long_string_storage_offset += length;
572 }
573 }
574
575 // For binary or utf-8 view arrays, an extra buffer is appended which stores
576 // the lengths of each variadic data buffer as int64_t.
577 // This buffer is necessary since these buffer lengths are not trivially
578 // extractable from other data in an array of binary or utf-8 view type.
579 u8_buffer<int64_t> buffer_sizes(
580 static_cast<std::size_t>(1),
581 static_cast<int64_t>(long_string_storage_size)
582 );
583
584 return {std::move(length_buffer), std::move(long_string_storage), std::move(buffer_sizes)};
585
586#ifdef __GNUC__
587# pragma GCC diagnostic pop
588#endif
589 }
590
591 template <std::ranges::sized_range T, class CR>
592 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
593 requires std::convertible_to<std::ranges::range_value_t<R>, T>
594 arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
595 R&& range,
596 VB&& validity_input,
597 std::optional<std::string_view> name,
598 std::optional<METADATA_RANGE> metadata
599 )
600 {
601 const auto size = range_size(range);
602 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
603 const auto null_count = vbitmap.null_count();
604
605 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
606
607 // create arrow schema
608 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
609
610 // create buffers
611 auto buffers_parts = create_buffers(std::forward<R>(range));
612
613 std::vector<buffer<uint8_t>> buffers{
614 std::move(vbitmap).extract_storage(),
615 std::move(buffers_parts.length_buffer),
616 std::move(buffers_parts.long_string_storage),
617 std::move(buffers_parts.buffer_sizes).extract_storage()
618 };
619
620 constexpr repeat_view<bool> children_ownership(true, 0);
621
622 // create arrow array
623 ArrowArray arr = make_arrow_array(
624 static_cast<std::int64_t>(size), // length
625 static_cast<int64_t>(null_count),
626 0, // offset
627 std::move(buffers),
628 nullptr, // children
630 nullptr, // dictionary
631 true
632 );
633
634 return arrow_proxy{std::move(arr), std::move(schema)};
635 }
636
637 template <std::ranges::sized_range T, class CR>
638 template <std::ranges::input_range NULLABLE_RANGE, input_metadata_container METADATA_RANGE>
639 requires std::convertible_to<std::ranges::range_value_t<NULLABLE_RANGE>, nullable<T>>
640 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
641 NULLABLE_RANGE&& nullable_range,
642 std::optional<std::string_view> name,
643 std::optional<METADATA_RANGE> metadata
644 )
645 {
646 auto values = nullable_range
647 | std::views::transform(
648 [](const auto& v)
649 {
650 return static_cast<T>(v.value());
651 }
652 );
653
654 auto is_non_null = nullable_range
655 | std::views::transform(
656 [](const auto& v)
657 {
658 return v.has_value();
659 }
660 );
661
662 return create_proxy(
663 std::forward<decltype(values)>(values),
664 std::forward<decltype(is_non_null)>(is_non_null),
665 name,
666 metadata
667 );
668 }
669
670 template <std::ranges::sized_range T, class CR>
671 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
672 requires std::convertible_to<std::ranges::range_value_t<R>, T>
673 [[nodiscard]] arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
674 R&& range,
675 bool nullable,
676 std::optional<std::string_view> name,
677 std::optional<METADATA_RANGE> metadata
678 )
679 {
680 if (nullable)
681 {
682 return create_proxy(std::forward<R>(range), validity_bitmap{}, std::move(name), std::move(metadata));
683 }
684
685 // create arrow schema
686 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), std::nullopt);
687
688 // create buffers
689 auto buffers_parts = create_buffers(std::forward<R>(range));
690
691 std::vector<buffer<uint8_t>> buffers{
692 buffer<uint8_t>{nullptr, 0}, // validity bitmap
693 std::move(buffers_parts.length_buffer),
694 std::move(buffers_parts.long_string_storage),
695 std::move(buffers_parts.buffer_sizes).extract_storage()
696 };
697 const auto size = range_size(range);
698
699 constexpr repeat_view<bool> children_ownership(true, 0);
700
701 // create arrow array
702 ArrowArray arr = make_arrow_array(
703 static_cast<std::int64_t>(size), // length
704 static_cast<int64_t>(0),
705 0, // offset
706 std::move(buffers),
707 nullptr, // children
709 nullptr, // dictionary
710 true
711 );
712
713 return arrow_proxy{std::move(arr), std::move(schema)};
714 }
715
716 template <std::ranges::sized_range T, class CR>
717 template <std::ranges::input_range VALUE_BUFFERS_RANGE, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
718 requires std::convertible_to<std::ranges::range_value_t<VALUE_BUFFERS_RANGE>, u8_buffer<uint8_t>>
719 arrow_proxy variable_size_binary_view_array_impl<T, CR>::create_proxy(
720 size_t element_count,
722 VALUE_BUFFERS_RANGE&& value_buffers,
723 VB&& validity_input,
724 std::optional<std::string_view> name,
725 std::optional<METADATA_RANGE> metadata
726 )
727 {
728 const auto size = buffer_view.size() / DATA_BUFFER_SIZE;
729 SPARROW_ASSERT_TRUE(size == element_count);
730
731 static const std::optional<std::unordered_set<sparrow::ArrowFlag>> flags{{ArrowFlag::NULLABLE}};
732
733 ArrowSchema schema = create_arrow_schema(std::move(name), std::move(metadata), flags);
734
735 auto bitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
736 std::vector<buffer<uint8_t>> buffers{std::move(bitmap).extract_storage(), std::move(buffer_view)};
737 for (auto&& buf : value_buffers)
738 {
739 buffers.push_back(std::forward<decltype(buf)>(buf));
740 }
741
742 // Create buffer sizes for the variadic buffers
743 u8_buffer<int64_t> buffer_sizes(value_buffers.size());
744 for (std::size_t i = 0; i < value_buffers.size(); ++i)
745 {
746 buffer_sizes[i] = static_cast<int64_t>(value_buffers[i].size());
747 }
748 buffers.push_back(std::move(buffer_sizes).extract_storage());
749
750 constexpr repeat_view<bool> children_ownership(true, 0);
751
752 ArrowArray arr = make_arrow_array(
753 static_cast<std::int64_t>(size), // length
754 static_cast<std::int64_t>(bitmap.null_count()), // null_count
755 0, // offset
756 std::move(buffers),
757 nullptr, // children
759 nullptr, // dictionary
760 true
761 );
762
763 return arrow_proxy{std::move(arr), std::move(schema)};
764 }
765
766 template <std::ranges::sized_range T, class CR>
767 constexpr auto variable_size_binary_view_array_impl<T, CR>::value(size_type i) -> inner_reference
768 {
769 return static_cast<const self_type*>(this)->value(i);
770 }
771
772 template <std::ranges::sized_range T, class CR>
773 constexpr auto variable_size_binary_view_array_impl<T, CR>::value(size_type i) const
774 -> inner_const_reference
775 {
776#ifdef __GNUC__
777# pragma GCC diagnostic push
778# pragma GCC diagnostic ignored "-Wcast-align"
779#endif
780
781 SPARROW_ASSERT_TRUE(i < this->size());
782 using char_or_byte = typename inner_const_reference::value_type;
783
784 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
785 + (i * DATA_BUFFER_SIZE);
786 const auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
787
788 if (length <= SHORT_STRING_SIZE)
789 {
790 constexpr std::ptrdiff_t data_offset = 4;
791 const auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
792 const auto ret = inner_const_reference(ptr + data_offset, length);
793 return ret;
794 }
795 else
796 {
797 const auto buffer_index = static_cast<std::size_t>(
798 *reinterpret_cast<const std::int32_t*>(data_ptr + BUFFER_INDEX_OFFSET)
799 );
800 const auto buffer_offset = static_cast<std::size_t>(
801 *reinterpret_cast<const std::int32_t*>(data_ptr + BUFFER_OFFSET_OFFSET)
802 );
803 const auto buffer = this->get_arrow_proxy()
804 .buffers()[buffer_index + FIRST_VAR_DATA_BUFFER_INDEX]
805 .template data<const char_or_byte>();
806 return inner_const_reference(buffer + buffer_offset, length);
807 }
808
809#ifdef __GNUC__
810# pragma GCC diagnostic pop
811#endif
812 }
813
814 template <std::ranges::sized_range T, class CR>
815 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_begin() -> value_iterator
816 {
818 }
819
820 template <std::ranges::sized_range T, class CR>
821 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_end() -> value_iterator
822 {
823 return value_iterator(detail::layout_value_functor<self_type, inner_reference>(this), this->size());
824 }
825
826 template <std::ranges::sized_range T, class CR>
827 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_cbegin() const -> const_value_iterator
828 {
830 }
831
832 template <std::ranges::sized_range T, class CR>
833 constexpr auto variable_size_binary_view_array_impl<T, CR>::value_cend() const -> const_value_iterator
834 {
835 return const_value_iterator(
837 this->size()
838 );
839 }
840}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
constexpr size_type size() const noexcept
Object that owns a piece of contiguous memory.
Definition buffer.hpp:113
constexpr U * data() noexcept
Definition buffer.hpp:630
constexpr size_type null_count() const noexcept
Returns the number of bits set to false (null/invalid).
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
variable_size_binary_view_array_impl(arrow_proxy)
Constructs variable-size binary view array from Arrow proxy.
variable_size_binary_view_array_impl(Args &&... args)
Generic constructor for creating variable-size binary view array.
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Gets the count of types contained in a typelist.
Definition mp_utils.hpp:216
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:117
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient alias for arrays with mutable validity bitmaps.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
constexpr std::size_t range_size(R &&r)
Definition ranges.hpp:32
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
variable_size_binary_view_array_impl< arrow_traits< std::string >::value_type, arrow_traits< std::string >::const_reference > string_view_array
A variable-size string view layout implementation.
dynamic_bitset< std::uint8_t > validity_bitmap
Type alias for a validity bitmap using 8-bit storage blocks.
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
Ensures a validity bitmap of the specified size from various input types.
variable_size_binary_view_array_impl< arrow_traits< std::vector< byte_t > >::value_type, arrow_traits< std::vector< byte_t > >::const_reference > binary_view_array
A variable-size binary view layout implementation.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
functor_index_iterator< detail::layout_value_functor< array_type, inner_reference > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_const_reference > > const_value_iterator
Base class for array_inner_types specializations.
Traits class that must be specialized by array implementations.
Provides compile-time information about Arrow data types.
Metafunction for retrieving the data_type of a typed array.