sparrow 0.9.0
Loading...
Searching...
No Matches
arrow_array_schema_proxy.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <string_view>
23#include <unordered_set>
24
35
36namespace sparrow
37{
/**
 * Exception thrown by the arrow_proxy class.
 *
 * It adds no state of its own: the message is forwarded unchanged to
 * std::runtime_error and can be read back through what().
 */
class arrow_proxy_exception : public std::runtime_error
{
    using base_type = std::runtime_error;

public:

    /**
     * Builds the exception from a human-readable description of the failure.
     *
     * @param message Text returned by what().
     */
    explicit arrow_proxy_exception(const std::string& message)
        : base_type(message)
    {
    }
};
50
56
62
72 {
73 public:
74
85
86 // Copy constructors
89
90 // Move constructors
// Move assignment; declared noexcept.
92 SPARROW_API arrow_proxy& operator=(arrow_proxy&&) noexcept;
93
95
// Returns the format of the proxy as a string view.
96 [[nodiscard]] SPARROW_API const std::string_view format() const;
97
// Sets the format. Per the project documentation the string is expected to
// follow the Arrow format specification.
104 SPARROW_API void set_format(const std::string_view format);
// Returns the data_type enumerator describing this proxy.
105 [[nodiscard]] SPARROW_API enum data_type data_type() const;
106
// Returns the name, if any.
// NOTE(review): presumably an empty optional means the ArrowSchema has no name - confirm.
113 [[nodiscard]] SPARROW_API std::optional<std::string_view> name() const;
114
// Sets the name of the ArrowSchema.
120 SPARROW_API void set_name(std::optional<std::string_view> name);
// Returns a key/value view over the metadata, if any.
121 [[nodiscard]] SPARROW_API std::optional<key_value_view> metadata() const;
122
128 template <input_metadata_container R>
129 void set_metadata(std::optional<R> metadata)
130 {
131 if (!schema_created_with_sparrow())
132 {
133 throw arrow_proxy_exception("Cannot set metadata on non-sparrow created ArrowArray");
134 }
136 if (!metadata.has_value())
137 {
138 private_data->metadata() = std::nullopt;
139 }
140 else
141 {
143 }
144 schema().metadata = private_data->metadata_ptr();
145 }
146
// NOTE(review): in this listing several numbered lines are empty; the member
// index of the project documentation mentions setters (set_length,
// set_null_count, set_offset, set_n_buffers, set_buffer, ...) that presumably
// belong on those lines - confirm against the real header.

// Returns the flags as an unordered set of ArrowFlag values.
147 [[nodiscard]] SPARROW_API std::unordered_set<ArrowFlag> flags() const;
148
// Sets the flags of the ArrowSchema.
154 SPARROW_API void set_flags(const std::unordered_set<ArrowFlag>& flags);
// Size accessors. NOTE(review): presumably these mirror the corresponding
// ArrowArray fields (length, null_count, offset, n_buffers, n_children) - confirm.
155 [[nodiscard]] SPARROW_API size_t length() const;
156
165 [[nodiscard]] SPARROW_API int64_t null_count() const;
166
173 [[nodiscard]] SPARROW_API size_t offset() const;
174
181 [[nodiscard]] SPARROW_API size_t n_buffers() const;
182
190 [[nodiscard]] SPARROW_API size_t n_children() const;
// Access to the buffer views held by the proxy (const and mutable overloads).
191 [[nodiscard]] SPARROW_API const std::vector<sparrow::buffer_view<uint8_t>>& buffers() const;
192 [[nodiscard]] SPARROW_API std::vector<sparrow::buffer_view<uint8_t>>& buffers();
193
202
211
// Resizes the bitmap buffer of the ArrowArray; `value` defaults to true.
// NOTE(review): presumably `value` is the bit given to newly added elements - confirm.
219 SPARROW_API void resize_bitmap(size_t new_size, bool value = true);
220
// Inserts `count` elements of the same `value` in the bitmap buffer at `index`.
231 SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count = 1);
232
// Inserts the elements of `range` in the bitmap at `index`. The definition
// later in this file returns the distance between the beginning of the bitmap
// and the iterator returned by the insertion.
242 template <std::ranges::input_range R>
243 size_t insert_bitmap(size_t index, const R& range);
244
// Erases `count` elements in the bitmap buffer starting at `index`.
254 SPARROW_API size_t erase_bitmap(size_t index, size_t count = 1);
255
263
270
// NOTE(review): the declaration that these template/requires lines introduce is
// missing from this listing; the member index documents it as
// add_children(const R&), "Add children without taking their ownership".
277 template <std::ranges::input_range R>
278 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
280
// Adds children from a range of arrow_array_and_schema; the definition later in
// this file moves the array and the schema out of each element.
287 template <std::ranges::input_range R>
288 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
289 void add_children(R&& arrow_array_and_schemas);
290
299
308
// Pops n children.
314 SPARROW_API void pop_children(size_t n);
315
326
338
// Access to the child proxies (const and mutable overloads).
339 [[nodiscard]] SPARROW_API const std::vector<arrow_proxy>& children() const;
340 [[nodiscard]] SPARROW_API std::vector<arrow_proxy>& children();
341
// Access to the dictionary proxy; the definition of operator<< support in this
// file treats a null pointer as "no dictionary".
342 [[nodiscard]] SPARROW_API const std::unique_ptr<arrow_proxy>& dictionary() const;
343 [[nodiscard]] SPARROW_API std::unique_ptr<arrow_proxy>& dictionary();
344
354
// Sets the dictionary from an ArrowArray and an ArrowSchema passed by rvalue reference.
365 SPARROW_API void set_dictionary(ArrowArray&& array_dictionary, ArrowSchema&& schema_dictionary);
366
// Checks if the ArrowArray and ArrowSchema were created with sparrow.
370 [[nodiscard]] SPARROW_API bool is_created_with_sparrow() const;
371
// Returns an untyped pointer to private data.
// NOTE(review): which private data (array or schema) is not visible here - confirm.
372 [[nodiscard]] SPARROW_API void* private_data() const;
373
// Gets a non-owning view of the arrow_proxy.
377 [[nodiscard]] SPARROW_API arrow_proxy view() const;
378
// Checks whether the proxy has ownership of its internal ArrowArray.
382 [[nodiscard]] SPARROW_API bool owns_array() const;
383
393
// Gets a reference to the ArrowArray of the proxy.
401 [[nodiscard]] SPARROW_API ArrowArray& array();
402
// Gets a const reference to the ArrowArray of the proxy.
410 [[nodiscard]] SPARROW_API const ArrowArray& array() const;
411
// Checks whether the proxy has ownership of its internal ArrowSchema.
415 [[nodiscard]] SPARROW_API bool owns_schema() const;
416
426
435
// Gets a const reference to the ArrowSchema of the proxy.
443 [[nodiscard]] SPARROW_API const ArrowSchema& schema() const;
444
447
// Slices the array to keep only the elements between the given start and end.
// NOTE(review): presumably slice() returns an independent proxy whereas
// slice_view() returns a non-owning one - confirm against the implementation.
457 [[nodiscard]] SPARROW_API arrow_proxy slice(size_t start, size_t end) const;
458
468 [[nodiscard]] SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const;
469
474
475 private:
476
// Each variant stores either a pointer to an ArrowArray / ArrowSchema or the
// structure itself by value.
// NOTE(review): presumably the pointer alternative is the non-owning case and
// the value alternative the owning one (see owns_array / owns_schema) - confirm.
477 std::variant<ArrowArray*, ArrowArray> m_array;
478 std::variant<ArrowSchema*, ArrowSchema> m_schema;
// Views over the buffers, proxies of the children and of the dictionary, as
// exposed by buffers(), children() and dictionary().
479 std::vector<sparrow::buffer_view<uint8_t>> m_buffers;
480 std::vector<arrow_proxy> m_children;
481 std::unique_ptr<arrow_proxy> m_dictionary;
482
// Empty tag type taken as last parameter by the constrained constructor below.
483 struct impl_tag
484 {
485 };
486
487 // Build an empty proxy. Convenient for resizing vector of children
488 arrow_proxy();
489
// Constructor template constrained so that, once cv/reference qualifiers and
// one level of pointer are removed, AA is ArrowArray and AS is ArrowSchema;
// it therefore accepts both the pointer and the by-value forms.
490 template <typename AA, typename AS>
491 requires std::same_as<std::remove_pointer_t<std::remove_cvref_t<AA>>, ArrowArray>
492 && std::same_as<std::remove_pointer_t<std::remove_cvref_t<AS>>, ArrowSchema>
493 arrow_proxy(AA&& array, AS&& schema, impl_tag);
494
495 [[nodiscard]] bool empty() const;
// Resizes the set of children to children_count entries; add_children calls it
// before filling the new slots with set_child.
496 SPARROW_API void resize_children(size_t children_count);
497
// Returns a non-owning bitset; the range overload of insert_bitmap uses it to
// edit the bitmap in place.
498 [[nodiscard]] SPARROW_API non_owning_dynamic_bitset<uint8_t> get_non_owning_dynamic_bitset();
499
// Internal refresh helpers.
// NOTE(review): presumably they re-synchronise the cached members above with
// the underlying ArrowArray / ArrowSchema - confirm.
500 void update_children();
501 void update_dictionary();
502 void update_null_count();
503 void reset();
504
// Per-structure variants of is_created_with_sparrow(); set_metadata relies on
// the schema one.
505 [[nodiscard]] bool array_created_with_sparrow() const;
506 [[nodiscard]] SPARROW_API bool schema_created_with_sparrow() const;
507
508 void validate_array_and_schema() const;
509
510 [[nodiscard]] bool is_arrow_array_valid() const;
511 [[nodiscard]] bool is_arrow_schema_valid() const;
512 [[nodiscard]] bool is_proxy_valid() const;
513
514 [[nodiscard]] size_t get_null_count() const;
515
// Accessors to the raw structures that, going by their names, skip the
// sanitize step. NOTE(review): what "sanitize" does is not visible here.
516 [[nodiscard]] ArrowArray& array_without_sanitize();
517 [[nodiscard]] const ArrowArray& array_without_sanitize() const;
518
519 [[nodiscard]] ArrowSchema& schema_without_sanitize();
520 [[nodiscard]] const ArrowSchema& schema_without_sanitize() const;
521
526 void sanitize_schema();
527
// Member swap; declared noexcept.
528 void swap(arrow_proxy& other) noexcept;
529 };
530
531 template <std::ranges::input_range R>
532 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
534 {
536 {
537 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
538 }
539
540 const size_t add_children_count = std::ranges::size(arrow_array_and_schema_pointers);
541 const size_t original_children_count = n_children();
542 const size_t new_children_count = original_children_count + add_children_count;
543
544 resize_children(new_children_count);
545 for (size_t i = 0; i < add_children_count; ++i)
546 {
547 set_child(
548 i + original_children_count,
551 );
552 }
553 }
554
555 template <std::ranges::input_range R>
556 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
557 void arrow_proxy::add_children(R&& arrow_arrays_and_schemas)
558 {
560 {
561 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
562 }
563
564 const size_t add_children_count = std::ranges::size(arrow_arrays_and_schemas);
565 const size_t original_children_count = n_children();
566 const size_t new_children_count = original_children_count + add_children_count;
567
568 resize_children(new_children_count);
569 for (size_t i = 0; i < add_children_count; ++i)
570 {
571 set_child(
572 i + original_children_count,
573 std::move(arrow_arrays_and_schemas[i].array),
574 std::move(arrow_arrays_and_schemas[i].schema)
575 );
576 }
577 }
578
579 template <std::ranges::input_range R>
580 inline size_t arrow_proxy::insert_bitmap(size_t index, const R& range)
581 {
583 {
584 throw arrow_proxy_exception("Cannot modify the bitmap on non-sparrow created ArrowArray");
585 }
587 auto bitmap = get_non_owning_dynamic_bitset();
588 const auto it = bitmap.insert(sparrow::next(bitmap.cbegin(), index), range.begin(), range.end());
589 return static_cast<size_t>(std::distance(bitmap.begin(), it));
590 }
591}
592
593#if defined(__cpp_lib_format)
594
/**
 * std::formatter specialization printing a buffer_view<uint8_t> as a bracketed
 * list of its elements, e.g. "[1 2 3]".
 *
 * The format specification is either empty or a single character, which then
 * replaces the default ' ' separator written between two elements.
 */
template <>
struct std::formatter<sparrow::buffer_view<uint8_t>>
{
private:

    char m_separator = ' ';
    static constexpr std::string_view opening = "[";
    static constexpr std::string_view closing = "]";

public:

    /**
     * Reads the optional separator character.
     *
     * @throws std::format_error if more than one character precedes the
     *         closing brace of the replacement field.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        auto cursor = ctx.begin();
        const auto last = ctx.end();

        // Empty specification: keep the default separator.
        if (cursor == last || *cursor == '}')
        {
            return cursor;
        }

        m_separator = *cursor;
        ++cursor;

        // Anything after the separator other than the closing brace is an error.
        if (cursor != last && *cursor != '}')
        {
            throw std::format_error("Invalid format specifier for range");
        }
        return cursor;
    }

    /**
     * Writes the opening bracket, the elements separated by the configured
     * character, then the closing bracket.
     */
    auto format(const sparrow::buffer_view<uint8_t>& range, std::format_context& ctx) const
    {
        auto sink = ctx.out();
        sink = sparrow::ranges::copy(opening, sink).out;

        bool needs_separator = false;
        for (const auto& value : range)
        {
            if (needs_separator)
            {
                *sink++ = m_separator;
            }
            else
            {
                needs_separator = true;
            }
            sink = std::format_to(sink, "{}", value);
        }

        sink = sparrow::ranges::copy(closing, sink).out;
        return sink;
    }
};
650
/// Streams a buffer_view<uint8_t> using its std::formatter specialization
/// with the default ("{}") format.
inline std::ostream& operator<<(std::ostream& os, const sparrow::buffer_view<uint8_t>& value)
{
    return os << std::format("{}", value);
}
656
/**
 * std::formatter specialization producing a multi-line, human-readable dump
 * of an arrow_proxy: format, name, metadata, data type, sizes, buffers,
 * children and dictionary.
 */
template <>
struct std::formatter<sparrow::arrow_proxy>
{
    /// No format specification is interpreted; parsing stops at the beginning.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes the description of the proxy to the output of the context.
     *
     * Children and the dictionary are formatted recursively with this same
     * formatter; a missing dictionary is printed as "nullptr".
     */
    auto format(const sparrow::arrow_proxy& obj, std::format_context& ctx) const
    {
        // Query the proxy once instead of on every loop iteration.
        const auto& buffers = obj.buffers();
        const size_t buffer_count = obj.n_buffers();

        std::string buffers_description_str;
        for (size_t i = 0; i < buffer_count; ++i)
        {
            std::format_to(
                std::back_inserter(buffers_description_str),
                "<{}[{} b]{}",
                "uint8_t",
                buffers[i].size() * sizeof(uint8_t),
                buffers[i]
            );
        }

        std::string children_str;
        for (const auto& child : obj.children())
        {
            std::format_to(std::back_inserter(children_str), "{}\n", child);
        }

        const std::string dictionary_str = obj.dictionary() ? std::format("{}", *obj.dictionary()) : "nullptr";

        // Every field uses the same "- label: value" layout.
        return std::format_to(
            ctx.out(),
            "arrow_proxy\n- format: {}\n- name: {}\n- metadata: {}\n- data_type: {}\n- null_count: {}\n- length: {}\n- offset: {}\n- n_buffers: {}\n- buffers:\n{}\n- n_children: {}\n- children: {}\n- dictionary: {}",
            obj.format(),
            obj.name().value_or(""),
            obj.metadata().value_or(""),
            obj.data_type(),
            obj.null_count(),
            obj.length(),
            obj.offset(),
            obj.n_buffers(),
            buffers_description_str,
            obj.n_children(),
            children_str,
            dictionary_str
        );
    }
};
705
/// Streams an arrow_proxy using its std::formatter specialization with the
/// default ("{}") format.
inline std::ostream& operator<<(std::ostream& os, const sparrow::arrow_proxy& value)
{
    return os << std::format("{}", value);
}
711
712#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Private data for ArrowArray.
Exception thrown by the arrow_proxy class.
arrow_proxy_exception(const std::string &message)
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API void push_back_bitmap(bool value)
Push a value at the end of the bitmap buffer.
SPARROW_API ArrowArray extract_array()
Extract the ArrowArray from the proxy, and transfers the responsibility to release it after usage to ...
SPARROW_API const ArrowSchema & schema() const
Get a const reference to the ArrowSchema of the proxy.
SPARROW_API void add_child(ArrowArray *array, ArrowSchema *schema)
Add a child without taking its ownership.
SPARROW_API arrow_proxy & operator=(const arrow_proxy &)
SPARROW_API arrow_proxy(arrow_proxy &&) noexcept
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema *schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and uses the provided ArrowSche...
SPARROW_API void set_buffer(size_t index, buffer< uint8_t > &&buffer)
Set the buffer at the given index.
SPARROW_API ArrowSchema & schema()
Get a reference to the ArrowSchema of the proxy.
SPARROW_API const std::unique_ptr< arrow_proxy > & dictionary() const
SPARROW_API std::unordered_set< ArrowFlag > flags() const
void SPARROW_API set_data_type(enum data_type data_type)
Set the data type.
SPARROW_API bool owns_schema() const
Check whether the proxy has ownership of its internal ArrowSchema.
SPARROW_API std::vector< arrow_proxy > & children()
SPARROW_API void add_child(ArrowArray &&array, ArrowSchema &&schema)
Add a child and takes its ownership.
SPARROW_API bool is_created_with_sparrow() const
Check if the ArrowArray and ArrowSchema were created with sparrow.
SPARROW_API size_t offset() const
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema &&schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and ArrowSchema.
SPARROW_API void set_name(std::optional< std::string_view > name)
Set the name of the ArrowSchema.
SPARROW_API const std::string_view format() const
SPARROW_API size_t length() const
SPARROW_API void set_n_buffers(size_t n_buffers)
Set the number of buffers of the ArrowArray.
SPARROW_API void set_buffer(size_t index, const buffer_view< uint8_t > &buffer)
Set the buffer at the given index.
SPARROW_API void set_child(size_t index, ArrowArray *array, ArrowSchema *schema)
Set the child at the given index.
SPARROW_API void pop_back_bitmap()
Pop a value at the end of the bitmap buffer.
SPARROW_API void set_null_count(int64_t null_count)
Set the null count of the ArrowArray.
SPARROW_API const std::vector< arrow_proxy > & children() const
SPARROW_API std::vector< sparrow::buffer_view< uint8_t > > & buffers()
SPARROW_API enum data_type data_type() const
SPARROW_API arrow_proxy(const arrow_proxy &)
void set_metadata(std::optional< R > metadata)
Set the metadata of the ArrowSchema.
SPARROW_API const ArrowArray & array() const
Get a const reference to the ArrowArray of the proxy.
SPARROW_API void set_child(size_t index, ArrowArray &&array, ArrowSchema &&schema)
Set the child at the given index.
SPARROW_API size_t n_children() const
SPARROW_API ArrowArray & array()
Get a reference to the ArrowArray of the proxy.
SPARROW_API std::optional< std::string_view > name() const
SPARROW_API std::optional< key_value_view > metadata() const
SPARROW_API arrow_array_private_data * get_array_private_data()
SPARROW_API bool owns_array() const
Check whether the proxy has ownership of its internal ArrowArray.
SPARROW_API void set_length(size_t length)
Set the length of the ArrowArray.
SPARROW_API void set_dictionary(ArrowArray &&array_dictionary, ArrowSchema &&schema_dictionary)
Set the dictionary.
SPARROW_API void set_offset(size_t offset)
Set the offset of the ArrowArray.
SPARROW_API void * private_data() const
SPARROW_API void set_format(const std::string_view format)
Set the format according to the Arrow format specification: https://arrow.apache.org/docs/dev/format/...
SPARROW_API arrow_proxy view() const
Get a non-owning view of the arrow_proxy.
SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
SPARROW_API void pop_children(size_t n)
Pop n children.
SPARROW_API std::unique_ptr< arrow_proxy > & dictionary()
SPARROW_API void resize_bitmap(size_t new_size, bool value=true)
Resize the bitmap buffer of the ArrowArray.
SPARROW_API size_t n_buffers() const
SPARROW_API size_t erase_bitmap(size_t index, size_t count=1)
Erase several elements in the bitmap buffer at the given index.
SPARROW_API int64_t null_count() const
SPARROW_API arrow_schema_private_data * get_schema_private_data()
SPARROW_API arrow_proxy slice(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
void add_children(const R &arrow_array_and_schema_pointers)
Add children without taking their ownership.
SPARROW_API void set_dictionary(ArrowArray *array, ArrowSchema *schema)
Set the dictionary. It does not take ownership of the ArrowArray and ArrowSchema passed by pointer...
SPARROW_API const std::vector< sparrow::buffer_view< uint8_t > > & buffers() const
SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count=1)
Insert elements of the same value in the bitmap buffer at the given index.
SPARROW_API ArrowSchema extract_schema()
Extract the ArrowSchema from the proxy, and transfers the responsibility to release it after usage to...
SPARROW_API void set_flags(const std::unordered_set< ArrowFlag > &flags)
Set the flags of the ArrowSchema.
SPARROW_API void update_buffers()
Refresh the buffers views.
SPARROW_API arrow_proxy(ArrowArray *array, ArrowSchema *schema)
Constructs an arrow_proxy which uses the provided ArrowArray and ArrowSchema.
Private data for ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
#define SPARROW_API
Definition config.hpp:38
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:116
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
constexpr bool has_bitmap(data_type dt)
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
std::string get_metadata_from_key_values(const T &metadata)
Definition metadata.hpp:109
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Definition nullable.hpp:933
const char * metadata