sparrow 0.9.0
Loading...
Searching...
No Matches
arrow_array_schema_proxy.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <string_view>
23#include <unordered_set>
24
35
36namespace sparrow
37{
/**
 * Exception type raised by arrow_proxy operations.
 *
 * Derives from std::runtime_error, so the message supplied at construction
 * is available through what().
 */
class arrow_proxy_exception : public std::runtime_error
{
public:

    /**
     * Builds the exception from a descriptive message.
     *
     * @param message Text forwarded to the std::runtime_error base.
     */
    explicit arrow_proxy_exception(const std::string& message)
        : std::runtime_error{message}
    {
    }
};
58
64
70
110 {
111 public:
112
126
141
157
170
183
195
208
217
226 [[nodiscard]] SPARROW_API const std::string_view format() const;
227
240 SPARROW_API void set_format(const std::string_view format);
241
249 [[nodiscard]] SPARROW_API enum data_type data_type() const;
250
264
273 [[nodiscard]] SPARROW_API std::optional<std::string_view> name() const;
274
286 SPARROW_API void set_name(std::optional<std::string_view> name);
287
296 [[nodiscard]] SPARROW_API std::optional<key_value_view> metadata() const;
297
// Sets the metadata key-value pairs of the schema from an optional container, or clears them
// when `metadata` holds no value.
// Throws arrow_proxy_exception when schema_created_with_sparrow() returns false.
// NOTE(review): listing lines 318 and 325 are absent from this extract, so the declaration of the
// local `private_data` and the body of the else-branch are not visible here.
311 template <input_metadata_container R>
312 void set_metadata(std::optional<R> metadata)
313 {
314 if (!schema_created_with_sparrow())
315 {
// NOTE(review): the guard above tests the schema, but this message names "ArrowArray" - confirm which is intended.
316 throw arrow_proxy_exception("Cannot set metadata on non-sparrow created ArrowArray");
317 }
319 if (!metadata.has_value())
320 {
// No metadata supplied: reset the stored metadata to an empty optional.
321 private_data->metadata() = std::nullopt;
322 }
323 else
324 {
326 }
// Re-point the ArrowSchema `metadata` field at the pointer exposed by the private data.
327 schema().metadata = private_data->metadata_ptr();
328 }
329
338 [[nodiscard]] SPARROW_API std::unordered_set<ArrowFlag> flags() const;
339
352 SPARROW_API void set_flags(const std::unordered_set<ArrowFlag>& flags);
353
362 [[nodiscard]] SPARROW_API size_t length() const;
363
380
389 [[nodiscard]] SPARROW_API int64_t null_count() const;
390
407
416 [[nodiscard]] SPARROW_API size_t offset() const;
417
432
441 [[nodiscard]] SPARROW_API size_t n_buffers() const;
442
457
466 [[nodiscard]] SPARROW_API size_t n_children() const;
467
477 [[nodiscard]] SPARROW_API const std::vector<sparrow::buffer_view<uint8_t>>& buffers() const;
478
488 [[nodiscard]] SPARROW_API std::vector<sparrow::buffer_view<uint8_t>>& buffers();
489
507
526
543 SPARROW_API void resize_bitmap(size_t new_size, bool value = true);
544
565 SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count = 1);
566
589 template <std::ranges::input_range R>
590 size_t insert_bitmap(size_t index, const R& range);
591
612 SPARROW_API size_t erase_bitmap(size_t index, size_t count = 1);
613
628
642
649 template <std::ranges::input_range R>
650 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
652
659 template <std::ranges::input_range R>
660 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
661 void add_children(R&& arrow_array_and_schemas);
662
671
680
686 SPARROW_API void pop_children(size_t n);
687
698
710
711
722 [[nodiscard]] SPARROW_API const std::vector<arrow_proxy>& children() const;
723
734 [[nodiscard]] SPARROW_API std::vector<arrow_proxy>& children();
735
746 [[nodiscard]] SPARROW_API const std::unique_ptr<arrow_proxy>& dictionary() const;
747
758 [[nodiscard]] SPARROW_API std::unique_ptr<arrow_proxy>& dictionary();
759
769
779 SPARROW_API void set_dictionary(ArrowArray&& array_dictionary, ArrowSchema&& schema_dictionary);
780
784 [[nodiscard]] SPARROW_API bool is_created_with_sparrow() const;
785
786 [[nodiscard]] SPARROW_API void* private_data() const;
787
791 [[nodiscard]] SPARROW_API arrow_proxy view() const;
792
796 [[nodiscard]] SPARROW_API bool owns_array() const;
797
807
815 [[nodiscard]] SPARROW_API ArrowArray& array();
816
824 [[nodiscard]] SPARROW_API const ArrowArray& array() const;
825
829 [[nodiscard]] SPARROW_API bool owns_schema() const;
830
840
849
857 [[nodiscard]] SPARROW_API const ArrowSchema& schema() const;
858
861
871 [[nodiscard]] SPARROW_API arrow_proxy slice(size_t start, size_t end) const;
872
882 [[nodiscard]] SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const;
883
888
889 private:
890
891 std::variant<ArrowArray*, ArrowArray> m_array;
892 std::variant<ArrowSchema*, ArrowSchema> m_schema;
893 std::vector<sparrow::buffer_view<uint8_t>> m_buffers;
894 std::vector<arrow_proxy> m_children;
895 std::unique_ptr<arrow_proxy> m_dictionary;
896
897 struct impl_tag
898 {
899 };
900
901 // Build an empty proxy. Convenient for resizing vector of children
902 arrow_proxy();
903
904 template <typename AA, typename AS>
905 requires std::same_as<std::remove_pointer_t<std::remove_cvref_t<AA>>, ArrowArray>
906 && std::same_as<std::remove_pointer_t<std::remove_cvref_t<AS>>, ArrowSchema>
907 arrow_proxy(AA&& array, AS&& schema, impl_tag);
908
909 [[nodiscard]] bool empty() const;
910 SPARROW_API void resize_children(size_t children_count);
911
912 [[nodiscard]] SPARROW_API non_owning_dynamic_bitset<uint8_t> get_non_owning_dynamic_bitset();
913
914 void update_children();
915 void update_dictionary();
916 void update_null_count();
917 void reset();
918
919 [[nodiscard]] bool array_created_with_sparrow() const;
920 [[nodiscard]] SPARROW_API bool schema_created_with_sparrow() const;
921
922 void validate_array_and_schema() const;
923
924 [[nodiscard]] bool is_arrow_array_valid() const;
925 [[nodiscard]] bool is_arrow_schema_valid() const;
926 [[nodiscard]] bool is_proxy_valid() const;
927
928 [[nodiscard]] size_t get_null_count() const;
929
930 [[nodiscard]] ArrowArray& array_without_sanitize();
931 [[nodiscard]] const ArrowArray& array_without_sanitize() const;
932
933 [[nodiscard]] ArrowSchema& schema_without_sanitize();
934 [[nodiscard]] const ArrowSchema& schema_without_sanitize() const;
935
940 void sanitize_schema();
941
942 void swap(arrow_proxy& other) noexcept;
943 };
944
// Definition of the add_children overload for ranges of arrow_array_and_schema_pointers.
// Grows the children by the size of the input range and fills the new slots through set_child().
// NOTE(review): listing lines 947 (function signature), 949 (presumably the guard condition for the
// throw below) and 963-964 (the remaining set_child arguments) are absent from this extract.
945 template <std::ranges::input_range R>
946 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema_pointers>
948 {
950 {
// NOTE(review): the message mentions "n_buffers" although this function adds children - looks copy-pasted; confirm.
951 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
952 }
953
954 const size_t add_children_count = std::ranges::size(arrow_array_and_schema_pointers);
955 const size_t original_children_count = n_children();
956 const size_t new_children_count = original_children_count + add_children_count;
957
// Make room first, then write each new child after the already existing ones.
958 resize_children(new_children_count);
959 for (size_t i = 0; i < add_children_count; ++i)
960 {
961 set_child(
962 i + original_children_count,
965 );
966 }
967 }
968
// Appends the elements of `arrow_arrays_and_schemas` as new children, moving each element's
// `array` and `schema` into set_child().
// NOTE(review): listing line 973 (presumably the guard condition for the throw below) is absent from this extract.
// NOTE(review): the body relies on std::ranges::size and operator[], which the input_range
// constraint alone does not guarantee.
969 template <std::ranges::input_range R>
970 requires std::same_as<std::ranges::range_value_t<R>, arrow_array_and_schema>
971 void arrow_proxy::add_children(R&& arrow_arrays_and_schemas)
972 {
974 {
// NOTE(review): the message mentions "n_buffers" although this function adds children - looks copy-pasted; confirm.
975 throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray or ArrowSchema");
976 }
977
978 const size_t add_children_count = std::ranges::size(arrow_arrays_and_schemas);
979 const size_t original_children_count = n_children();
980 const size_t new_children_count = original_children_count + add_children_count;
981
// Make room first, then write each new child after the already existing ones.
982 resize_children(new_children_count);
983 for (size_t i = 0; i < add_children_count; ++i)
984 {
985 set_child(
986 i + original_children_count,
987 std::move(arrow_arrays_and_schemas[i].array),
988 std::move(arrow_arrays_and_schemas[i].schema)
989 );
990 }
991 }
992
// Inserts the values of `range` into the validity bitmap before position `index`.
// Returns the distance from the start of the bitmap to the iterator returned by the insertion.
// NOTE(review): listing lines 996 (presumably the guard condition for the throw below) and 1000
// are absent from this extract.
993 template <std::ranges::input_range R>
994 inline size_t arrow_proxy::insert_bitmap(size_t index, const R& range)
995 {
997 {
998 throw arrow_proxy_exception("Cannot modify the bitmap on non-sparrow created ArrowArray");
999 }
// The insertion goes through the bitset obtained from get_non_owning_dynamic_bitset().
1001 auto bitmap = get_non_owning_dynamic_bitset();
1002 const auto it = bitmap.insert(sparrow::next(bitmap.cbegin(), index), range.begin(), range.end());
1003 return static_cast<size_t>(std::distance(bitmap.begin(), it));
1004 }
1005}
1006
1007#if defined(__cpp_lib_format)
1008
/**
 * std::formatter specialization for sparrow::buffer_view<uint8_t>.
 *
 * The output is the opening bracket, every element formatted with "{}" and
 * separated by the delimiter, then the closing bracket. The format
 * specification may hold at most one character, which replaces the default
 * space delimiter.
 */
template <>
struct std::formatter<sparrow::buffer_view<uint8_t>>
{
private:

    char delimiter = ' ';
    static constexpr std::string_view opening = "[";
    static constexpr std::string_view closing = "]";

public:

    /**
     * Reads the optional one-character delimiter from the format specification.
     *
     * @param ctx Parse context positioned at the start of the specification.
     * @return Iterator to the end of the parsed specification.
     * @throws std::format_error if more than one character precedes the closing '}'.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        auto cursor = ctx.begin();
        const auto last = ctx.end();

        // Empty specification: keep the default delimiter.
        if (cursor == last || *cursor == '}')
        {
            return cursor;
        }

        delimiter = *cursor;
        ++cursor;

        // Exactly one character was supplied: accept it.
        if (cursor == last || *cursor == '}')
        {
            return cursor;
        }

        throw std::format_error("Invalid format specifier for range");
    }

    /**
     * Writes the bracketed, delimiter-separated elements of `range`.
     *
     * @param range Buffer view whose elements are printed.
     * @param ctx Format context providing the output iterator.
     * @return Output iterator positioned after the closing bracket.
     */
    auto format(const sparrow::buffer_view<uint8_t>& range, std::format_context& ctx) const
    {
        auto sink = sparrow::ranges::copy(opening, ctx.out()).out;

        // The delimiter is emitted before every element except the first one.
        bool separator_due = false;
        for (const auto& item : range)
        {
            if (separator_due)
            {
                *sink++ = delimiter;
            }
            sink = std::format_to(sink, "{}", item);
            separator_due = true;
        }

        return sparrow::ranges::copy(closing, sink).out;
    }
};
1064
/// Streams `value` to `os` using its std::formatter specialization and returns `os`.
inline std::ostream& operator<<(std::ostream& os, const sparrow::buffer_view<uint8_t>& value)
{
    return os << std::format("{}", value);
}
1070
/**
 * std::formatter specialization producing a multi-line, human-readable
 * description of a sparrow::arrow_proxy (format, name, metadata, data type,
 * counts, buffers, children and dictionary).
 */
template <>
struct std::formatter<sparrow::arrow_proxy>
{
    /**
     * Accepts the format specification without interpreting it.
     *
     * @param ctx Parse context positioned at the start of the specification.
     * @return Iterator to the start of the specification.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes the description of `obj` to the output of `ctx`.
     *
     * @param obj Proxy to describe.
     * @param ctx Format context providing the output iterator.
     * @return Output iterator positioned after the written text.
     */
    auto format(const sparrow::arrow_proxy& obj, std::format_context& ctx) const
    {
        // One entry per buffer: element type, size in bytes, then the buffer content.
        // NOTE(review): the leading '<' has no matching '>' - confirm whether that is intended.
        std::string buffers_description_str;
        for (size_t i = 0; i < obj.n_buffers(); ++i)
        {
            std::format_to(
                std::back_inserter(buffers_description_str),
                "<{}[{} b]{}",
                "uint8_t",
                obj.buffers()[i].size() * sizeof(uint8_t),
                obj.buffers()[i]
            );
        }

        // Children are formatted recursively through this same formatter, one per line.
        std::string children_str;
        for (const auto& child : obj.children())
        {
            std::format_to(std::back_inserter(children_str), "{}\n", child);
        }

        const std::string dictionary_str = obj.dictionary() ? std::format("{}", *obj.dictionary()) : "nullptr";

        // Labels use a uniform "- key: value" layout ("name:", "null_count: " and "- children:"
        // were previously misspelled as "name;", "null_count:" and "-children:").
        // NOTE(review): metadata().value_or("") builds the fallback from an empty string literal -
        // confirm that this is a valid value for the metadata view type.
        return std::format_to(
            ctx.out(),
            "arrow_proxy\n- format: {}\n- name: {}\n- metadata: {}\n- data_type: {}\n- null_count: {}\n- length: {}\n- offset: {}\n- n_buffers: {}\n- buffers:\n{}\n- n_children: {}\n- children: {}\n- dictionary: {}",
            obj.format(),
            obj.name().value_or(""),
            obj.metadata().value_or(""),
            obj.data_type(),
            obj.null_count(),
            obj.length(),
            obj.offset(),
            obj.n_buffers(),
            buffers_description_str,
            obj.n_children(),
            children_str,
            dictionary_str
        );
    }
};
1119
/// Streams `value` to `os` using its std::formatter specialization and returns `os`.
inline std::ostream& operator<<(std::ostream& os, const sparrow::arrow_proxy& value)
{
    return os << std::format("{}", value);
}
1125
1126#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Private data for ArrowArray.
Exception thrown by arrow_proxy operations.
arrow_proxy_exception(const std::string &message)
Constructs an arrow_proxy_exception with a descriptive message.
SPARROW_API void push_back_bitmap(bool value)
Appends a validity bit at the end of the bitmap.
SPARROW_API ArrowArray extract_array()
Extract the ArrowArray from the proxy, and transfers the responsibility to release it after usage to ...
SPARROW_API const ArrowSchema & schema() const
Get a const reference to the ArrowSchema of the proxy.
SPARROW_API void add_child(ArrowArray *array, ArrowSchema *schema)
Add a child without taking its ownership.
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema *schema)
Constructs an arrow_proxy taking ownership of ArrowArray, referencing ArrowSchema.
SPARROW_API void set_buffer(size_t index, buffer< uint8_t > &&buffer)
Sets a specific buffer by moving it at the given index.
SPARROW_API ArrowSchema & schema()
Get a reference to the ArrowSchema of the proxy.
SPARROW_API const std::unique_ptr< arrow_proxy > & dictionary() const
Returns a constant reference to the dictionary arrow proxy.
SPARROW_API std::unordered_set< ArrowFlag > flags() const
Gets the Arrow flags set for this array.
void SPARROW_API set_data_type(enum data_type data_type)
Sets the data type (updates format string accordingly).
SPARROW_API bool owns_schema() const
Check whether the proxy has ownership of its internal ArrowSchema.
SPARROW_API std::vector< arrow_proxy > & children()
Returns a mutable reference to the vector of child arrow proxies.
SPARROW_API void add_child(ArrowArray &&array, ArrowSchema &&schema)
Add a child and takes its ownership.
SPARROW_API arrow_proxy & operator=(const arrow_proxy &other)
Copy assignment operator.
SPARROW_API bool is_created_with_sparrow() const
Check if the ArrowArray and ArrowSchema were created with sparrow.
SPARROW_API size_t offset() const
Gets the starting offset within the buffers.
SPARROW_API arrow_proxy(ArrowArray &&array, ArrowSchema &&schema)
Constructs an arrow_proxy taking ownership of both ArrowArray and ArrowSchema.
SPARROW_API void set_name(std::optional< std::string_view > name)
Sets the name of the array/field.
SPARROW_API const std::string_view format() const
Gets the Arrow format string describing the data type.
SPARROW_API size_t length() const
Gets the number of elements in the array.
SPARROW_API void set_n_buffers(size_t n_buffers)
Sets the number of buffers and resizes the buffer vector.
SPARROW_API void set_buffer(size_t index, const buffer_view< uint8_t > &buffer)
Sets a specific buffer at the given index.
SPARROW_API void set_child(size_t index, ArrowArray *array, ArrowSchema *schema)
Set the child at the given index.
SPARROW_API arrow_proxy & operator=(arrow_proxy &&other) noexcept
Move assignment operator.
SPARROW_API void pop_back_bitmap()
Removes the last validity bit from the bitmap.
SPARROW_API void set_null_count(int64_t null_count)
Sets the number of null values in the array.
SPARROW_API const std::vector< arrow_proxy > & children() const
Returns a constant reference to the vector of child arrow proxies.
SPARROW_API std::vector< sparrow::buffer_view< uint8_t > > & buffers()
Gets mutable reference to the buffer views.
SPARROW_API enum data_type data_type() const
Gets the data type enum corresponding to the format.
void set_metadata(std::optional< R > metadata)
Sets the metadata key-value pairs.
SPARROW_API const ArrowArray & array() const
Get a const reference to the ArrowArray of the proxy.
SPARROW_API void set_child(size_t index, ArrowArray &&array, ArrowSchema &&schema)
Set the child at the given index.
SPARROW_API size_t n_children() const
Gets the number of child arrays.
SPARROW_API ArrowArray & array()
Get a reference to the ArrowArray of the proxy.
SPARROW_API ~arrow_proxy()
Destructor releasing owned Arrow structures.
SPARROW_API std::optional< std::string_view > name() const
Gets the optional name of the array/field.
SPARROW_API std::optional< key_value_view > metadata() const
Gets the metadata key-value pairs.
SPARROW_API arrow_array_private_data * get_array_private_data()
SPARROW_API bool owns_array() const
Check whether the proxy has ownership of its internal ArrowArray.
SPARROW_API void set_length(size_t length)
Sets the number of elements in the array.
SPARROW_API void set_dictionary(ArrowArray &&array_dictionary, ArrowSchema &&schema_dictionary)
Set the dictionary.
SPARROW_API void set_offset(size_t offset)
Sets the starting offset within the buffers.
SPARROW_API void * private_data() const
SPARROW_API void set_format(const std::string_view format)
Sets the Arrow format string.
SPARROW_API arrow_proxy view() const
Get a non-owning view of the arrow_proxy.
SPARROW_API arrow_proxy slice_view(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
SPARROW_API void pop_children(size_t n)
Pop n children.
SPARROW_API std::unique_ptr< arrow_proxy > & dictionary()
Returns a mutable reference to the dictionary arrow proxy.
SPARROW_API void resize_bitmap(size_t new_size, bool value=true)
Resizes the validity bitmap buffer.
SPARROW_API size_t n_buffers() const
Gets the number of buffers in the array.
SPARROW_API size_t erase_bitmap(size_t index, size_t count=1)
Erases validity bits starting at specified position.
SPARROW_API int64_t null_count() const
Gets the number of null values in the array.
SPARROW_API arrow_schema_private_data * get_schema_private_data()
SPARROW_API arrow_proxy(arrow_proxy &&other) noexcept
Move constructor transferring ownership.
SPARROW_API arrow_proxy slice(size_t start, size_t end) const
Slices the array to keep only the elements between the given start and end.
SPARROW_API arrow_proxy(const arrow_proxy &other)
Copy constructor creating independent copy.
void add_children(const R &arrow_array_and_schema_pointers)
Add children without taking their ownership.
SPARROW_API void set_dictionary(ArrowArray *array, ArrowSchema *schema)
Set the dictionary. It does not take ownership of the ArrowArray and ArrowSchema passed by pointer...
SPARROW_API const std::vector< sparrow::buffer_view< uint8_t > > & buffers() const
Gets const reference to the buffer views.
SPARROW_API size_t insert_bitmap(size_t index, bool value, size_t count=1)
Inserts validity bits with the same value at specified position.
SPARROW_API ArrowSchema extract_schema()
Extract the ArrowSchema from the proxy, and transfers the responsibility to release it after usage to...
SPARROW_API void set_flags(const std::unordered_set< ArrowFlag > &flags)
Sets the Arrow flags for this array.
SPARROW_API void update_buffers()
Refresh the buffers views.
SPARROW_API arrow_proxy(ArrowArray *array, ArrowSchema *schema)
Constructs an arrow_proxy referencing external ArrowArray and ArrowSchema.
Private data for ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:112
#define SPARROW_API
Definition config.hpp:38
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:117
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
std::string get_metadata_from_key_values(const T &metadata)
Converts a container of key-value pairs to binary metadata format.
Definition metadata.hpp:323
constexpr bool has_bitmap(data_type dt) noexcept
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
const char * metadata