sparrow 0.6.0
Loading...
Searching...
No Matches
arrow_array_schema_proxy.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <string_view>
23#include <unordered_set>
24
36
37namespace sparrow
38{
/**
 * Exception thrown by the arrow_proxy class.
 *
 * The description given at construction is forwarded to std::runtime_error
 * and is therefore available through what().
 */
class arrow_proxy_exception : public std::runtime_error
{
public:

    /// Builds the exception from a description of the failure.
    explicit arrow_proxy_exception(const std::string& message)
        : std::runtime_error{message}
    {
    }
};
51
57
63
73 {
74 public:
75
86
87 // Copy constructors
90
91 // Move constructors
94
96
97 [[nodiscard]] SPARROW_API const std::string_view format() const;
98
105 SPARROW_API void set_format(const std::string_view format);
106 [[nodiscard]] SPARROW_API enum data_type data_type() const;
107
114 [[nodiscard]] SPARROW_API std::optional<std::string_view> name() const;
115
121 SPARROW_API void set_name(std::optional<std::string_view> name);
122 [[nodiscard]] SPARROW_API std::optional<key_value_view> metadata() const;
123
// Set the metadata of the ArrowSchema.
//
// Throws arrow_proxy_exception when schema_created_with_sparrow() returns false.
// An empty optional resets the stored metadata to std::nullopt; in every case the
// schema's metadata pointer is refreshed from the private data before returning.
//
// NOTE(review): listing lines 136 and 143 are absent from this extract. The
// declaration of `private_data` and the body of the else branch are therefore
// not visible here and are not described.
// NOTE(review): the error message names "ArrowArray" while the guard checks the
// schema (schema_created_with_sparrow) - confirm the intended wording.
129 template <input_metadata_container R>
130 void set_metadata(std::optional<R> metadata)
131 {
132 if (!schema_created_with_sparrow())
133 {
134 throw arrow_proxy_exception("Cannot set metadata on non-sparrow created ArrowArray");
135 }
137 if (!metadata.has_value())
138 {
139 private_data->metadata() = std::nullopt;
140 }
141 else
142 {
144 }
// Re-point the ArrowSchema at the (possibly changed) stored metadata.
145 schema().metadata = private_data->metadata_ptr();
146 }
147
148 [[nodiscard]] SPARROW_API std::unordered_set<ArrowFlag> flags() const;
149
155 SPARROW_API void set_flags(const std::unordered_set<ArrowFlag>& flags);
156 [[nodiscard]] SPARROW_API size_t length() const;
157
166 [[nodiscard]] SPARROW_API int64_t null_count() const;
167
174 [[nodiscard]] SPARROW_API size_t offset() const;
175
182 [[nodiscard]] SPARROW_API size_t n_buffers() const;
183
191 [[nodiscard]] SPARROW_API size_t n_children() const;
192 [[nodiscard]] SPARROW_API const std::vector<sparrow::buffer_view<uint8_t>>& buffers() const;
193 [[nodiscard]] SPARROW_API std::vector<sparrow::buffer_view<uint8_t>>& buffers();
194
203
212
220 SPARROW_API void resize_bitmap(size_t new_size, bool value = true);
221
232 SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count = 1);
233
243 template <std::ranges::input_range R>
244 size_t insert_bitmap(size_t index, const R& range);
245
255 SPARROW_API size_t erase_bitmap(size_t index, size_t count = 1);
256
264
271
278 template <std::ranges::input_range R>
279 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
281
288 template <std::ranges::input_range R>
289 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
290 void add_children(R&& arrow_array_and_schemas);
291
300
309
315 SPARROW_API void pop_children(size_t n);
316
327
339
340 [[nodiscard]] SPARROW_API const std::vector<arrow_proxy>& children() const;
341 [[nodiscard]] SPARROW_API std::vector<arrow_proxy>& children();
342
343 [[nodiscard]] SPARROW_API const std::unique_ptr<arrow_proxy>& dictionary() const;
344 [[nodiscard]] SPARROW_API std::unique_ptr<arrow_proxy>& dictionary();
345
355
366 SPARROW_API void set_dictionary(ArrowArray&& array_dictionary, ArrowSchema&& schema_dictionary);
367
368 [[nodiscard]] SPARROW_API bool is_created_with_sparrow() const;
369
370 [[nodiscard]] SPARROW_API void* private_data() const;
371
375 [[nodiscard]] SPARROW_API arrow_proxy view() const;
376
377 [[nodiscard]] SPARROW_API bool owns_array() const;
379 [[nodiscard]] SPARROW_API ArrowArray& array();
380 [[nodiscard]] SPARROW_API const ArrowArray& array() const;
381
382 [[nodiscard]] SPARROW_API bool owns_schema() const;
385 [[nodiscard]] SPARROW_API const ArrowSchema& schema() const;
386
389
399 [[nodiscard]] SPARROW_API arrow_proxy slice(size_t start, size_t end) const;
400
410 [[nodiscard]] SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const;
411
416
417 private:
418
419 std::variant<ArrowArray*, ArrowArray> m_array;
420 std::variant<ArrowSchema*, ArrowSchema> m_schema;
421 std::vector<sparrow::buffer_view<uint8_t>> m_buffers;
422 std::vector<arrow_proxy> m_children;
423 std::unique_ptr<arrow_proxy> m_dictionary;
424
425 struct impl_tag
426 {
427 };
428
429 // Build an empty proxy. Convenient for resizing vector of children
430 arrow_proxy();
431
432 template <typename AA, typename AS>
433 requires std::same_as<std::remove_pointer_t<std::remove_cvref_t<AA>>, ArrowArray>
434 && std::same_as<std::remove_pointer_t<std::remove_cvref_t<AS>>, ArrowSchema>
435 arrow_proxy(AA&& array, AS&& schema, impl_tag);
436
437 [[nodiscard]] bool empty() const;
438 SPARROW_API void resize_children(size_t children_count);
439
440 [[nodiscard]] SPARROW_API non_owning_dynamic_bitset<uint8_t> get_non_owning_dynamic_bitset();
441
442 void update_children();
443 void update_dictionary();
444 void update_null_count();
445 void reset();
446
447 [[nodiscard]] bool array_created_with_sparrow() const;
448 [[nodiscard]] SPARROW_API bool schema_created_with_sparrow() const;
449
450 void validate_array_and_schema() const;
451
452 [[nodiscard]] bool is_arrow_array_valid() const;
453 [[nodiscard]] bool is_arrow_schema_valid() const;
454 [[nodiscard]] bool is_proxy_valid() const;
455
456 [[nodiscard]] size_t get_null_count() const;
457
458 void swap(arrow_proxy& other) noexcept;
459 };
460
// Add children without taking their ownership (presumably the out-of-class
// definition of add_children(const R& arrow_array_and_schema_pointers); the
// signature line itself, listing line 463, is missing from this extract).
//
// NOTE(review): listing lines 465 and 479-480 are also missing: the condition
// guarding the throw and the remaining arguments passed to set_child cannot be
// seen here.
// NOTE(review): the exception text mentions "n_buffers" although this function
// adds children - looks copied from another setter; confirm the wording.
// NOTE(review): std::ranges::size needs a sized range, which the
// std::ranges::input_range constraint alone does not guarantee.
461 template <std::ranges::input_range R>
462 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
464 {
466 {
467 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
468 }
469
470 const size_t add_children_count = std::ranges::size(arrow_array_and_schema_pointers);
471 const size_t original_children_count = n_children();
472 const size_t new_children_count = original_children_count + add_children_count;
473
// Grow the children storage first, then fill the new slots: the i-th added
// child is stored at index original_children_count + i.
474 resize_children(new_children_count);
475 for (size_t i = 0; i < add_children_count; ++i)
476 {
477 set_child(
478 i + original_children_count,
481 );
482 }
483 }
484
// Appends every element of arrow_arrays_and_schemas as a new child. The array
// and schema of each element are passed to set_child through std::move, so the
// elements of the input range are left in a moved-from state.
//
// NOTE(review): listing line 489 (the condition guarding the throw) is missing
// from this extract, so the exact precondition is not visible here.
// NOTE(review): the exception text mentions "n_buffers" although this function
// adds children - looks copied from another setter; confirm the wording.
// NOTE(review): std::ranges::size and operator[] are used on the range, but the
// std::ranges::input_range constraint guarantees neither a size nor indexing.
485 template <std::ranges::input_range R>
486 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
487 void arrow_proxy::add_children(R&& arrow_arrays_and_schemas)
488 {
490 {
491 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
492 }
493
494 const size_t add_children_count = std::ranges::size(arrow_arrays_and_schemas);
495 const size_t original_children_count = n_children();
496 const size_t new_children_count = original_children_count + add_children_count;
497
// Grow the children storage first, then fill the new slots: the i-th added
// child is stored at index original_children_count + i.
498 resize_children(new_children_count);
499 for (size_t i = 0; i < add_children_count; ++i)
500 {
501 set_child(
502 i + original_children_count,
503 std::move(arrow_arrays_and_schemas[i].array),
504 std::move(arrow_arrays_and_schemas[i].schema)
505 );
506 }
507 }
508
// Inserts the elements of `range` into the bitmap at position `index`.
//
// Returns the distance between the beginning of the bitmap and the iterator
// returned by the insertion.
//
// NOTE(review): listing lines 512 and 516 are missing from this extract: the
// condition guarding the throw and one statement preceding the bitmap access
// cannot be seen here.
509 template <std::ranges::input_range R>
510 inline size_t arrow_proxy::insert_bitmap(size_t index, const R& range)
511 {
513 {
514 throw arrow_proxy_exception("Cannot modify the bitmap on non-sparrow created ArrowArray");
515 }
// The insertion goes through a non-owning bitset obtained from this proxy.
517 auto bitmap = get_non_owning_dynamic_bitset();
518 const auto it = bitmap.insert(sparrow::next(bitmap.cbegin(), index), range.begin(), range.end());
519 return static_cast<size_t>(std::distance(bitmap.begin(), it));
520 }
521}
522
523#if defined(__cpp_lib_format)
524
/// std::formatter specialization for sparrow::buffer_view<uint8_t>.
///
/// The elements are written between "[" and "]", separated by a single
/// character. That character defaults to a space and can be replaced by
/// putting exactly one character in the format specifier.
template <>
struct std::formatter<sparrow::buffer_view<uint8_t>>
{
private:

    char delimiter = ' ';
    static constexpr std::string_view opening = "[";
    static constexpr std::string_view closing = "]";

public:

    /// Reads the optional one-character delimiter from the specifier.
    /// Throws std::format_error when more than one character precedes '}'.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        auto cursor = ctx.begin();
        const auto last = ctx.end();

        // True once the specifier is exhausted or its closing brace is reached.
        const auto at_spec_end = [&cursor, &last]
        {
            return cursor == last || *cursor == '}';
        };

        if (!at_spec_end())
        {
            delimiter = *cursor;
            ++cursor;
        }

        if (!at_spec_end())
        {
            throw std::format_error("Invalid format specifier for range");
        }

        return cursor;
    }

    /// Writes "[e0<delimiter>e1...]" to the context output and returns the
    /// advanced output iterator.
    auto format(const sparrow::buffer_view<uint8_t>& range, std::format_context& ctx) const
    {
        auto sink = ctx.out();
        sink = sparrow::ranges::copy(opening, sink).out;

        // The delimiter goes before every element except the first one.
        bool need_delimiter = false;
        for (const auto& value : range)
        {
            if (need_delimiter)
            {
                *sink = delimiter;
                ++sink;
            }
            sink = std::format_to(sink, "{}", value);
            need_delimiter = true;
        }

        sink = sparrow::ranges::copy(closing, sink).out;
        return sink;
    }
};
580
/// Streams `value` to `os` through its std::formatter specialization and
/// returns the stream.
inline std::ostream& operator<<(std::ostream& os, const sparrow::buffer_view<uint8_t>& value)
{
    return os << std::format("{}", value);
}
586
/// std::formatter specialization that renders a sparrow::arrow_proxy as a
/// multi-line, human-readable report with one "- label: value" entry per
/// property (format, name, metadata, data type, counts, buffers, children
/// and dictionary).
template <>
struct std::formatter<sparrow::arrow_proxy>
{
    /// No format option is interpreted: the iterator to the beginning of the
    /// specifier is returned unchanged.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /// Writes the description of `obj` to the context output and returns the
    /// advanced output iterator.
    auto format(const sparrow::arrow_proxy& obj, std::format_context& ctx) const
    {
        // One entry per buffer: element type, size in bytes, then the content.
        // NOTE(review): the '<' in the entry pattern is never closed - confirm
        // whether a matching '>' was intended.
        const auto& buffers = obj.buffers();
        std::string buffers_description_str;
        for (size_t i = 0; i < obj.n_buffers(); ++i)
        {
            std::format_to(
                std::back_inserter(buffers_description_str),
                "<{}[{} b]{}",
                "uint8_t",
                buffers[i].size() * sizeof(uint8_t),
                buffers[i]
            );
        }

        // Children are formatted recursively, one per line.
        std::string children_str;
        for (const auto& child : obj.children())
        {
            std::format_to(std::back_inserter(children_str), "{}\n", child);
        }

        const std::string dictionary_str = obj.dictionary() ? std::format("{}", *obj.dictionary()) : "nullptr";

        // Labels fixed to the uniform "- label: value" shape: "name;" is now
        // "name:", "null_count:" is followed by a space, and "-children:" has
        // the space after the dash.
        return std::format_to(
            ctx.out(),
            "arrow_proxy\n- format: {}\n- name: {}\n- metadata: {}\n- data_type: {}\n- null_count: {}\n- length: {}\n- offset: {}\n- n_buffers: {}\n- buffers:\n{}\n- n_children: {}\n- children: {}\n- dictionary: {}",
            obj.format(),
            obj.name().value_or(""),
            obj.metadata().value_or(""),
            obj.data_type(),
            obj.null_count(),
            obj.length(),
            obj.offset(),
            obj.n_buffers(),
            buffers_description_str,
            obj.n_children(),
            children_str,
            dictionary_str
        );
    }
};
635
/// Streams `value` to `os` through its std::formatter specialization and
/// returns the stream.
inline std::ostream& operator<<(std::ostream& os, const sparrow::arrow_proxy& value)
{
    return os << std::format("{}", value);
}
641
642#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Private data for ArrowArray.
Exception thrown by the arrow_proxy class.
arrow_proxy_exception(const std::string &message)
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API void push_back_bitmap(bool value)
Push a value at the end of the bitmap buffer.
SPARROW_API ArrowArray extract_array()
SPARROW_API const ArrowSchema & schema() const
SPARROW_API void add_child(ArrowArray *array, ArrowSchema *schema)
Add a child without taking its ownership.
SPARROW_API arrow_proxy & operator=(const arrow_proxy &)
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema *schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and uses the provided ArrowSche...
SPARROW_API void set_buffer(size_t index, buffer< uint8_t > &&buffer)
Set the buffer at the given index.
SPARROW_API ArrowSchema & schema()
SPARROW_API const std::unique_ptr< arrow_proxy > & dictionary() const
SPARROW_API std::unordered_set< ArrowFlag > flags() const
void SPARROW_API set_data_type(enum data_type data_type)
Set the data type.
SPARROW_API bool owns_schema() const
SPARROW_API std::vector< arrow_proxy > & children()
SPARROW_API void add_child(ArrowArray &&array, ArrowSchema &&schema)
Add a child and take ownership of it.
SPARROW_API bool is_created_with_sparrow() const
SPARROW_API size_t offset() const
SPARROW_API arrow_proxy & operator=(arrow_proxy &&)
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema &&schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and ArrowSchema.
SPARROW_API void set_name(std::optional< std::string_view > name)
Set the name of the ArrowSchema.
SPARROW_API const std::string_view format() const
SPARROW_API size_t length() const
SPARROW_API void set_n_buffers(size_t n_buffers)
Set the number of buffers of the ArrowArray.
SPARROW_API void set_buffer(size_t index, const buffer_view< uint8_t > &buffer)
Set the buffer at the given index.
SPARROW_API void set_child(size_t index, ArrowArray *array, ArrowSchema *schema)
Set the child at the given index.
SPARROW_API void pop_back_bitmap()
Pop a value at the end of the bitmap buffer.
SPARROW_API void set_null_count(int64_t null_count)
Set the null count of the ArrowArray.
SPARROW_API const std::vector< arrow_proxy > & children() const
SPARROW_API std::vector< sparrow::buffer_view< uint8_t > > & buffers()
SPARROW_API enum data_type data_type() const
SPARROW_API arrow_proxy(const arrow_proxy &)
void set_metadata(std::optional< R > metadata)
Set the metadata of the ArrowSchema.
SPARROW_API arrow_proxy(arrow_proxy &&)
SPARROW_API const ArrowArray & array() const
SPARROW_API void set_child(size_t index, ArrowArray &&array, ArrowSchema &&schema)
Set the child at the given index.
SPARROW_API size_t n_children() const
SPARROW_API ArrowArray & array()
SPARROW_API ~arrow_proxy()
SPARROW_API std::optional< std::string_view > name() const
SPARROW_API std::optional< key_value_view > metadata() const
SPARROW_API arrow_array_private_data * get_array_private_data()
SPARROW_API bool owns_array() const
SPARROW_API void set_length(size_t length)
Set the length of the ArrowArray.
SPARROW_API void set_dictionary(ArrowArray &&array_dictionary, ArrowSchema &&schema_dictionary)
Set the dictionary.
SPARROW_API void set_offset(size_t offset)
Set the offset of the ArrowArray.
SPARROW_API void * private_data() const
SPARROW_API void set_format(const std::string_view format)
Set the format according to the Arrow format specification: https://arrow.apache.org/docs/dev/format/...
SPARROW_API arrow_proxy view() const
Get a non-owning view of the arrow_proxy.
SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
SPARROW_API void pop_children(size_t n)
Pop n children.
SPARROW_API std::unique_ptr< arrow_proxy > & dictionary()
SPARROW_API void resize_bitmap(size_t new_size, bool value=true)
Resize the bitmap buffer of the ArrowArray.
SPARROW_API size_t n_buffers() const
SPARROW_API size_t erase_bitmap(size_t index, size_t count=1)
Erase several elements in the bitmap buffer at the given index.
SPARROW_API int64_t null_count() const
SPARROW_API arrow_schema_private_data * get_schema_private_data()
SPARROW_API arrow_proxy slice(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
void add_children(const R &arrow_array_and_schema_pointers)
Add children without taking their ownership.
SPARROW_API void set_dictionary(ArrowArray *array, ArrowSchema *schema)
Set the dictionary. It does not take ownership of the ArrowArray and ArrowSchema passed by pointer...
SPARROW_API const std::vector< sparrow::buffer_view< uint8_t > > & buffers() const
SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count=1)
Insert elements of the same value in the bitmap buffer at the given index.
SPARROW_API ArrowSchema extract_schema()
SPARROW_API void set_flags(const std::unordered_set< ArrowFlag > &flags)
Set the flags of the ArrowSchema.
SPARROW_API void update_buffers()
Refresh the buffers views.
SPARROW_API arrow_proxy(ArrowArray *array, ArrowSchema *schema)
Constructs an arrow_proxy which uses the provided ArrowArray and ArrowSchema.
Private data for ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:109
#define SPARROW_API
Definition config.hpp:38
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:118
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
constexpr bool has_bitmap(data_type dt)
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
std::string get_metadata_from_key_values(const T &metadata)
Definition metadata.hpp:115
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Definition nullable.hpp:900
const char * metadata