sparrow 0.3.0
Loading...
Searching...
No Matches
arrow_array_schema_proxy.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <string_view>
23
32
33namespace sparrow
34{
/**
 * Exception thrown by the arrow_proxy class.
 *
 * Derives from std::runtime_error, so it can be caught either by its own
 * type or through any of the standard exception base classes.
 */
class arrow_proxy_exception : public std::runtime_error
{
public:

    /**
     * Builds the exception from an explanatory message.
     *
     * @param message Text forwarded to std::runtime_error and later
     *                returned by what().
     */
    explicit arrow_proxy_exception(const std::string& message)
        : std::runtime_error(message)
    {
    }
};
47
53
59
69 {
70 public:
71
82
83 // Copy constructors
86
87 // Move constructors
90
92
93 [[nodiscard]] SPARROW_API const std::string_view format() const;
94
101 SPARROW_API void set_format(const std::string_view format);
102 [[nodiscard]] SPARROW_API enum data_type data_type() const;
103
110 [[nodiscard]] SPARROW_API std::optional<std::string_view> name() const;
111
117 SPARROW_API void set_name(std::optional<std::string_view> name);
118 [[nodiscard]] SPARROW_API std::optional<std::string_view> metadata() const;
119
125 SPARROW_API void set_metadata(std::optional<std::string_view> metadata);
126 [[nodiscard]] SPARROW_API std::vector<ArrowFlag> flags() const;
127
133 SPARROW_API void set_flags(const std::vector<ArrowFlag>& flags);
134 [[nodiscard]] SPARROW_API size_t length() const;
135
144 [[nodiscard]] SPARROW_API int64_t null_count() const;
145
152 [[nodiscard]] SPARROW_API size_t offset() const;
153
160 [[nodiscard]] SPARROW_API size_t n_buffers() const;
161
169 [[nodiscard]] SPARROW_API size_t n_children() const;
170 [[nodiscard]] SPARROW_API const std::vector<sparrow::buffer_view<uint8_t>>& buffers() const;
171 [[nodiscard]] SPARROW_API std::vector<sparrow::buffer_view<uint8_t>>& buffers();
172
181
190
198 SPARROW_API void resize_bitmap(size_t new_size, bool value = true);
199
210 SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count = 1);
211
221 template <std::ranges::input_range R>
222 size_t insert_bitmap(size_t index, const R& range);
223
233 SPARROW_API size_t erase_bitmap(size_t index, size_t count = 1);
234
242
249
256 template <std::ranges::input_range R>
257 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
259
266 template <std::ranges::input_range R>
267 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
268 void add_children(R&& arrow_array_and_schemas);
269
278
287
293 SPARROW_API void pop_children(size_t n);
294
305
317
318 [[nodiscard]] SPARROW_API const std::vector<arrow_proxy>& children() const;
319 [[nodiscard]] SPARROW_API std::vector<arrow_proxy>& children();
320
321 [[nodiscard]] SPARROW_API const std::unique_ptr<arrow_proxy>& dictionary() const;
322 [[nodiscard]] SPARROW_API std::unique_ptr<arrow_proxy>& dictionary();
323
333
334 [[nodiscard]] SPARROW_API bool is_created_with_sparrow() const;
335
336 [[nodiscard]] SPARROW_API void* private_data() const;
337
341 [[nodiscard]] SPARROW_API arrow_proxy view() const;
342
343 [[nodiscard]] SPARROW_API bool owns_array() const;
345 [[nodiscard]] SPARROW_API ArrowArray& array();
346 [[nodiscard]] SPARROW_API const ArrowArray& array() const;
347
348 [[nodiscard]] SPARROW_API bool owns_schema() const;
351 [[nodiscard]] SPARROW_API const ArrowSchema& schema() const;
352
355
365 [[nodiscard]] SPARROW_API arrow_proxy slice(size_t start, size_t end) const;
366
376 [[nodiscard]] SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const;
377
382
383 private:
384
385 std::variant<ArrowArray*, ArrowArray> m_array;
386 std::variant<ArrowSchema*, ArrowSchema> m_schema;
387 std::vector<sparrow::buffer_view<uint8_t>> m_buffers;
388 std::vector<arrow_proxy> m_children;
389 std::unique_ptr<arrow_proxy> m_dictionary;
390
391 struct impl_tag
392 {
393 };
394
395 // Build an empty proxy. Convenient for resizing vector of children
396 arrow_proxy();
397
398 template <typename AA, typename AS>
399 requires std::same_as<std::remove_pointer_t<std::remove_cvref_t<AA>>, ArrowArray>
400 && std::same_as<std::remove_pointer_t<std::remove_cvref_t<AS>>, ArrowSchema>
401 arrow_proxy(AA&& array, AS&& schema, impl_tag);
402
403 [[nodiscard]] bool empty() const;
404 SPARROW_API void resize_children(size_t children_count);
405
406 [[nodiscard]] SPARROW_API non_owning_dynamic_bitset<uint8_t> get_non_owning_dynamic_bitset();
407
408 void update_children();
409 void update_dictionary();
410 void update_null_count();
411 void reset();
412
413 [[nodiscard]] bool array_created_with_sparrow() const;
414 [[nodiscard]] bool schema_created_with_sparrow() const;
415
416 void validate_array_and_schema() const;
417
418 [[nodiscard]] bool is_arrow_array_valid() const;
419 [[nodiscard]] bool is_arrow_schema_valid() const;
420 [[nodiscard]] bool is_proxy_valid() const;
421
422 [[nodiscard]] size_t get_null_count() const;
423
424 void swap(arrow_proxy& other) noexcept;
425 };
426
427 template <std::ranges::input_range R>
428 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
430 {
432 {
433 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
434 }
435
436 const size_t add_children_count = std::ranges::size(arrow_array_and_schema_pointers);
437 const size_t original_children_count = n_children();
438 const size_t new_children_count = original_children_count + add_children_count;
439
440 resize_children(new_children_count);
441 for (size_t i = 0; i < add_children_count; ++i)
442 {
443 set_child(
444 i + original_children_count,
447 );
448 }
449 }
450
451 template <std::ranges::input_range R>
452 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
453 void arrow_proxy::add_children(R&& arrow_arrays_and_schemas)
454 {
456 {
457 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
458 }
459
460 const size_t add_children_count = std::ranges::size(arrow_arrays_and_schemas);
461 const size_t original_children_count = n_children();
462 const size_t new_children_count = original_children_count + add_children_count;
463
464 resize_children(new_children_count);
465 for (size_t i = 0; i < add_children_count; ++i)
466 {
467 set_child(
468 i + original_children_count,
469 std::move(arrow_arrays_and_schemas[i].array),
470 std::move(arrow_arrays_and_schemas[i].schema)
471 );
472 }
473 }
474
475 template <std::ranges::input_range R>
476 inline size_t arrow_proxy::insert_bitmap(size_t index, const R& range)
477 {
479 {
480 throw arrow_proxy_exception("Cannot modify the bitmap on non-sparrow created ArrowArray");
481 }
483 auto bitmap = get_non_owning_dynamic_bitset();
484 const auto it = bitmap.insert(sparrow::next(bitmap.cbegin(), index), range.begin(), range.end());
485 return static_cast<size_t>(std::distance(bitmap.begin(), it));
486 }
487}
488
489#if defined(__cpp_lib_format)
490
/**
 * std::formatter specialization for sparrow::buffer_view<uint8_t>.
 *
 * The view is rendered as "[e0<d>e1<d>...]" where <d> is a single delimiter
 * character. The delimiter defaults to a space and can be replaced by giving
 * exactly one character as the format specification.
 */
template <>
struct std::formatter<sparrow::buffer_view<uint8_t>>
{
private:

    // Character written between two consecutive elements.
    char delimiter = ' ';
    static constexpr std::string_view opening = "[";
    static constexpr std::string_view closing = "]";

public:

    /**
     * Reads the optional one-character delimiter from the format spec.
     *
     * @param ctx Parse context positioned at the start of the spec.
     * @return Iterator to the end of the consumed spec.
     * @throws std::format_error if more than one character precedes the
     *         closing brace.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        auto cursor = ctx.begin();
        const auto last = ctx.end();

        // True once the spec is exhausted or its closing brace is reached.
        const auto at_spec_end = [&cursor, &last]
        {
            return cursor == last || *cursor == '}';
        };

        if (!at_spec_end())
        {
            delimiter = *cursor;
            ++cursor;
        }

        if (!at_spec_end())
        {
            throw std::format_error("Invalid format specifier for range");
        }

        return cursor;
    }

    /**
     * Writes the bracketed, delimiter-separated elements of the view.
     *
     * @param range View whose elements are printed.
     * @param ctx   Format context providing the output iterator.
     * @return Output iterator past the last written character.
     */
    auto format(const sparrow::buffer_view<uint8_t>& range, std::format_context& ctx) const
    {
        auto sink = std::ranges::copy(opening, ctx.out()).out;

        // The delimiter goes before every element except the first one.
        bool needs_delimiter = false;
        for (const auto& value : range)
        {
            if (needs_delimiter)
            {
                *sink = delimiter;
                ++sink;
            }
            sink = std::format_to(sink, "{}", value);
            needs_delimiter = true;
        }

        return std::ranges::copy(closing, sink).out;
    }
};
546
/**
 * Streams a buffer view using its std::formatter specialization with the
 * default format specification.
 *
 * @param os    Destination stream.
 * @param value View to print.
 * @return The stream passed in, to allow chaining.
 */
inline std::ostream& operator<<(std::ostream& os, const sparrow::buffer_view<uint8_t>& value)
{
    return os << std::format("{}", value);
}
552
/**
 * std::formatter specialization for sparrow::arrow_proxy.
 *
 * Produces a multi-line, human-readable dump of the proxy: format, name,
 * metadata, data type, null count, length, offset, buffers, children and
 * dictionary. No format specification is parsed.
 */
template <>
struct std::formatter<sparrow::arrow_proxy>
{
    /**
     * Accepts the format specification without consuming anything.
     *
     * @param ctx Parse context positioned at the start of the spec.
     * @return The unchanged begin iterator of the spec.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes the textual description of the proxy.
     *
     * @param obj Proxy to describe.
     * @param ctx Format context providing the output iterator.
     * @return Output iterator past the last written character.
     */
    auto format(const sparrow::arrow_proxy& obj, std::format_context& ctx) const
    {
        // Query the buffer list and count once instead of on every iteration.
        const auto& buffers = obj.buffers();
        const size_t buffer_count = obj.n_buffers();

        // NOTE(review): entries are concatenated without a separator and the
        // leading '<' is never closed; kept as-is, confirm the intended layout.
        std::string buffers_description_str;
        for (size_t i = 0; i < buffer_count; ++i)
        {
            std::format_to(
                std::back_inserter(buffers_description_str),
                "<{}[{} b]{}",
                "uint8_t",
                buffers[i].size() * sizeof(uint8_t),
                buffers[i]
            );
        }

        // Children are formatted recursively, one per line.
        std::string children_str;
        for (const auto& child : obj.children())
        {
            std::format_to(std::back_inserter(children_str), "{}\n", child);
        }

        const std::string dictionary_str = obj.dictionary() ? std::format("{}", *obj.dictionary()) : "nullptr";

        // Every label uses the same "- <label>: <value>" shape.
        return std::format_to(
            ctx.out(),
            "arrow_proxy\n- format: {}\n- name: {}\n- metadata: {}\n- data_type: {}\n- null_count: {}\n- length: {}\n- offset: {}\n- n_buffers: {}\n- buffers:\n{}\n- n_children: {}\n- children: {}\n- dictionary: {}",
            obj.format(),
            obj.name().value_or(""),
            obj.metadata().value_or(""),
            obj.data_type(),
            obj.null_count(),
            obj.length(),
            obj.offset(),
            buffer_count,
            buffers_description_str,
            obj.n_children(),
            children_str,
            dictionary_str
        );
    }
};
601
/**
 * Streams an arrow_proxy using its std::formatter specialization.
 *
 * @param os    Destination stream.
 * @param value Proxy to print.
 * @return The stream passed in, to allow chaining.
 */
inline std::ostream& operator<<(std::ostream& os, const sparrow::arrow_proxy& value)
{
    return os << std::format("{}", value);
}
607
608#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Private data for ArrowArray.
Exception thrown by the arrow_proxy class.
arrow_proxy_exception(const std::string &message)
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API void push_back_bitmap(bool value)
Push a value at the end of the bitmap buffer.
SPARROW_API ArrowArray extract_array()
SPARROW_API const ArrowSchema & schema() const
SPARROW_API void add_child(ArrowArray *array, ArrowSchema *schema)
Add a child without taking its ownership.
SPARROW_API arrow_proxy & operator=(const arrow_proxy &)
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema *schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and uses the provided ArrowSchema.
SPARROW_API void set_metadata(std::optional< std::string_view > metadata)
Set the metadata of the ArrowSchema.
SPARROW_API void set_buffer(size_t index, buffer< uint8_t > &&buffer)
Set the buffer at the given index.
SPARROW_API ArrowSchema & schema()
SPARROW_API const std::unique_ptr< arrow_proxy > & dictionary() const
void SPARROW_API set_data_type(enum data_type data_type)
Set the data type.
SPARROW_API bool owns_schema() const
SPARROW_API std::vector< arrow_proxy > & children()
SPARROW_API void add_child(ArrowArray &&array, ArrowSchema &&schema)
Add a child and takes its ownership.
SPARROW_API bool is_created_with_sparrow() const
SPARROW_API size_t offset() const
SPARROW_API std::vector< ArrowFlag > flags() const
SPARROW_API arrow_proxy & operator=(arrow_proxy &&)
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema &&schema)
Constructs an arrow_proxy which takes the ownership of the ArrowArray and ArrowSchema.
SPARROW_API void set_name(std::optional< std::string_view > name)
Set the name of the ArrowSchema.
SPARROW_API const std::string_view format() const
SPARROW_API void set_flags(const std::vector< ArrowFlag > &flags)
Set the flags of the ArrowSchema.
SPARROW_API size_t length() const
SPARROW_API void set_n_buffers(size_t n_buffers)
Set the number of buffers of the ArrowArray.
SPARROW_API void set_buffer(size_t index, const buffer_view< uint8_t > &buffer)
Set the buffer at the given index.
SPARROW_API void set_child(size_t index, ArrowArray *array, ArrowSchema *schema)
Set the child at the given index.
SPARROW_API void pop_back_bitmap()
Pop a value at the end of the bitmap buffer.
SPARROW_API void set_null_count(int64_t null_count)
Set the null count of the ArrowArray.
SPARROW_API const std::vector< arrow_proxy > & children() const
SPARROW_API std::vector< sparrow::buffer_view< uint8_t > > & buffers()
SPARROW_API enum data_type data_type() const
SPARROW_API arrow_proxy(const arrow_proxy &)
SPARROW_API arrow_proxy(arrow_proxy &&)
SPARROW_API const ArrowArray & array() const
SPARROW_API void set_child(size_t index, ArrowArray &&array, ArrowSchema &&schema)
Set the child at the given index.
SPARROW_API std::optional< std::string_view > metadata() const
SPARROW_API size_t n_children() const
SPARROW_API ArrowArray & array()
SPARROW_API ~arrow_proxy()
SPARROW_API std::optional< std::string_view > name() const
SPARROW_API arrow_array_private_data * get_array_private_data()
SPARROW_API bool owns_array() const
SPARROW_API void set_length(size_t length)
Set the length of the ArrowArray.
SPARROW_API void set_offset(size_t offset)
Set the offset of the ArrowArray.
SPARROW_API void * private_data() const
SPARROW_API void set_format(const std::string_view format)
Set the format according to the Arrow format specification: https://arrow.apache.org/docs/dev/format/CDataInterface.html#data-type-description-format-strings
SPARROW_API arrow_proxy view() const
Get a non-owning view of the arrow_proxy.
SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
SPARROW_API void pop_children(size_t n)
Pop n children.
SPARROW_API std::unique_ptr< arrow_proxy > & dictionary()
SPARROW_API void resize_bitmap(size_t new_size, bool value=true)
Resize the bitmap buffer of the ArrowArray.
SPARROW_API size_t n_buffers() const
SPARROW_API size_t erase_bitmap(size_t index, size_t count=1)
Erase several elements in the bitmap buffer at the given index.
SPARROW_API int64_t null_count() const
SPARROW_API arrow_schema_private_data * get_schema_private_data()
SPARROW_API arrow_proxy slice(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
void add_children(const R &arrow_array_and_schema_pointers)
Add children without taking their ownership.
SPARROW_API void set_dictionary(ArrowArray *array, ArrowSchema *schema)
Set the dictionary.
SPARROW_API const std::vector< sparrow::buffer_view< uint8_t > > & buffers() const
SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count=1)
Insert elements of the same value in the bitmap buffer at the given index.
SPARROW_API ArrowSchema extract_schema()
SPARROW_API void update_buffers()
Refresh the buffers views.
SPARROW_API arrow_proxy(ArrowArray *array, ArrowSchema *schema)
Constructs an arrow_proxy which uses the provided ArrowArray and ArrowSchema.
Private data for ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:109
#define SPARROW_API
Definition config.hpp:38
#define SPARROW_ASSERT_TRUE(expr__)
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
constexpr bool has_bitmap(data_type dt)
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
std::ostream & operator<<(std::ostream &stream, T n)
Definition large_int.hpp:93