sparrow 0.6.0
Loading...
Searching...
No Matches
dictionary_encoded_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include "sparrow/array_api.hpp"
31
32namespace sparrow
33{
/**
 * Functor giving index-based access to the elements of a layout.
 *
 * The functor keeps a non-owning pointer to the layout and forwards
 * every call to the layout's operator[].
 *
 * @tparam Layout   Type of the layout being indexed. It must expose
 *                  \c reference / \c const_reference and operator[].
 * @tparam is_const When \c true the layout is accessed through a pointer to
 *                  const and \c const_reference is returned; otherwise a
 *                  mutable pointer is stored and \c reference is returned.
 */
template <class Layout, bool is_const>
class layout_element_functor
{
public:

    using layout_type = Layout;
    // Both branches are pointers: operator() dereferences p_layout with `->`,
    // so storing the layout by value in the mutable case could not work.
    using storage_type = std::conditional_t<is_const, const layout_type*, layout_type*>;
    using return_type = std::
        conditional_t<is_const, typename layout_type::const_reference, typename layout_type::reference>;

    /// Builds a functor that is not bound to any layout; it must not be called.
    constexpr layout_element_functor() = default;

    /**
     * Binds the functor to \p layout_.
     *
     * @param layout_ Pointer to the layout to index; it is stored as-is, not copied.
     */
    constexpr explicit layout_element_functor(storage_type layout_)
        : p_layout(layout_)
    {
    }

    /**
     * Returns the element at position \p i of the bound layout.
     *
     * @param i Index forwarded unchanged to the layout's operator[].
     */
    [[nodiscard]] return_type operator()(std::size_t i) const
    {
        return p_layout->operator[](i);
    }

private:

    // Null by default so that a default-constructed functor has a well-defined state.
    storage_type p_layout = nullptr;
};
60
61 template <std::integral IT>
62 class dictionary_encoded_array;
63
64 namespace detail
65 {
66 template <class T>
67 struct get_data_type_from_array;
68
69 template <std::integral IT>
71 {
72 [[nodiscard]] static constexpr sparrow::data_type get()
73 {
75 }
76 };
77
78 template <std::integral IT>
80 {
81 [[nodiscard]] static constexpr bool get()
82 {
83 return true;
84 }
85 };
86 }
87
91 template <class T>
93
94 template <std::integral IT>
96 {
97 public:
98
100 using size_type = std::size_t;
101 using difference_type = std::ptrdiff_t;
102
104
108
111
114
116
118
121
124
125 [[nodiscard]] std::optional<std::string_view> name() const;
126 [[nodiscard]] std::optional<key_value_view> metadata() const;
127
128 [[nodiscard]] size_type size() const;
129 [[nodiscard]] bool empty() const;
130
131 [[nodiscard]] const_reference operator[](size_type i) const;
132
133 [[nodiscard]] iterator begin();
134 [[nodiscard]] iterator end();
135
136 [[nodiscard]] const_iterator begin() const;
137 [[nodiscard]] const_iterator end() const;
138
139 [[nodiscard]] const_iterator cbegin() const;
140 [[nodiscard]] const_iterator cend() const;
141
142 [[nodiscard]] const_reference front() const;
143 [[nodiscard]] const_reference back() const;
144
145 template <class... Args>
147 explicit dictionary_encoded_array(Args&&... args)
148 : dictionary_encoded_array(create_proxy(std::forward<Args>(args)...))
149 {
150 }
151
161 [[nodiscard]] self_type slice(size_type start, size_type end) const;
162
172 [[nodiscard]] self_type slice_view(size_type start, size_type end) const;
173
174 private:
175
176 template <
178 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
179 [[nodiscard]] static auto create_proxy(
180 keys_buffer_type&& keys,
181 array&& values,
182 R&& bitmaps = validity_bitmap{},
183 std::optional<std::string_view> name = std::nullopt,
184 std::optional<METADATA_RANGE> metadata = std::nullopt
185 ) -> arrow_proxy;
186
187 using keys_layout = primitive_array<IT>;
188 using values_layout = cloning_ptr<array_wrapper>;
189
190 [[nodiscard]] const inner_value_type& dummy_inner_value() const;
191 // inner_const_reference dummy_inner_const_reference() const;
192 [[nodiscard]] const_reference dummy_const_reference() const;
193
194 [[nodiscard]] static keys_layout create_keys_layout(arrow_proxy& proxy);
195 [[nodiscard]] static values_layout create_values_layout(arrow_proxy& proxy);
196
197 [[nodiscard]] arrow_proxy& get_arrow_proxy();
198 [[nodiscard]] const arrow_proxy& get_arrow_proxy() const;
199
200 arrow_proxy m_proxy;
201 keys_layout m_keys_layout;
202 values_layout p_values_layout;
203
205 };
206
207 template <class IT>
209
210 /*******************************************
211 * dictionary_encoded_array implementation *
212 *******************************************/
213
214 template <std::integral IT>
216 : m_proxy(std::move(proxy))
217 , m_keys_layout(create_keys_layout(m_proxy))
218 , p_values_layout(create_values_layout(m_proxy))
219 {
220 SPARROW_ASSERT_TRUE(data_type_is_integer(m_proxy.data_type()));
221 }
222
223 template <std::integral IT>
225 : m_proxy(rhs.m_proxy)
226 , m_keys_layout(create_keys_layout(m_proxy))
227 , p_values_layout(create_values_layout(m_proxy))
228 {
229 }
230
231 template <std::integral IT>
233 {
234 if (this != &rhs)
235 {
236 m_proxy = rhs.m_proxy;
237 m_keys_layout = create_keys_layout(m_proxy);
238 p_values_layout = create_values_layout(m_proxy);
239 }
240 return *this;
241 }
242
243 template <std::integral IT>
245 : m_proxy(std::move(rhs.m_proxy))
246 , m_keys_layout(create_keys_layout(m_proxy))
247 , p_values_layout(create_values_layout(m_proxy))
248 {
249 }
250
251 template <std::integral IT>
253 {
254 if (this != &rhs)
255 {
256 using std::swap;
257 swap(m_proxy, rhs.m_proxy);
258 m_keys_layout = create_keys_layout(m_proxy);
259 p_values_layout = create_values_layout(m_proxy);
260 }
261 return *this;
262 }
263
264 template <std::integral IT>
265 template <validity_bitmap_input VBI, input_metadata_container METADATA_RANGE>
266 auto dictionary_encoded_array<IT>::create_proxy(
267 keys_buffer_type&& keys,
268 array&& values,
269 VBI&& validity_input,
270 std::optional<std::string_view> name,
271 std::optional<METADATA_RANGE> metadata
272 ) -> arrow_proxy
273 {
274 const auto size = keys.size();
275 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VBI>(validity_input));
276
277 auto [value_array, value_schema] = extract_arrow_structures(std::move(values));
278 const auto null_count = vbitmap.null_count();
279
281
282 // create arrow schema and array
285 std::move(name), // name
286 std::move(metadata), // metadata
287 std::nullopt, // flags
288 nullptr, // children
289 children_ownership, // children_ownership
290 new ArrowSchema(std::move(value_schema)), // dictionary
291 true // dictionary ownership
292 );
293
294 std::vector<buffer<uint8_t>> buffers(2);
295 buffers[0] = std::move(vbitmap).extract_storage();
296 buffers[1] = std::move(keys).extract_storage();
297
298 // create arrow array
300 static_cast<std::int64_t>(size), // length
301 static_cast<int64_t>(null_count),
302 0, // offset
303 std::move(buffers),
304 nullptr, // children
305 children_ownership, // children_ownership
306 new ArrowArray(std::move(value_array)), // dictionary
307 true // dictionary ownership
308 );
309 return arrow_proxy(std::move(arr), std::move(schema));
310 }
311
312 template <std::integral IT>
313 std::optional<std::string_view> dictionary_encoded_array<IT>::name() const
314 {
315 return m_proxy.name();
316 }
317
318 template <std::integral IT>
319 std::optional<key_value_view> dictionary_encoded_array<IT>::metadata() const
320 {
321 return m_proxy.metadata();
322 }
323
324 template <std::integral IT>
326 {
327 return m_proxy.length();
328 }
329
330 template <std::integral IT>
332 {
333 return size() == 0;
334 }
335
336 template <std::integral IT>
338 {
340 const auto index = m_keys_layout[i];
341
342 if (index.has_value())
343 {
344 SPARROW_ASSERT_TRUE(index.value() >= 0);
345 return array_element(*p_values_layout, static_cast<std::size_t>(index.value()));
346 }
347 else
348 {
349 return dummy_const_reference();
350 }
351 }
352
353 template <std::integral IT>
355 {
356 return iterator(functor_type(this), 0u);
357 }
358
359 template <std::integral IT>
361 {
362 return iterator(functor_type(this), size());
363 }
364
365 template <std::integral IT>
367 {
368 return cbegin();
369 }
370
371 template <std::integral IT>
373 {
374 return cend();
375 }
376
377 template <std::integral IT>
382
383 template <std::integral IT>
388
389 template <std::integral IT>
395
396 template <std::integral IT>
398 {
400 return operator[](size() - 1);
401 }
402
403 template <std::integral IT>
404 auto dictionary_encoded_array<IT>::dummy_inner_value() const -> const inner_value_type&
405 {
406 static const inner_value_type instance = array_default_element_value(*p_values_layout);
407 return instance;
408 }
409
410 template <std::integral IT>
412 {
413 SPARROW_ASSERT_TRUE(start <= end);
414 return self_type{get_arrow_proxy().slice(start, end)};
415 }
416
417 template <std::integral IT>
419 {
420 SPARROW_ASSERT_TRUE(start <= end);
421 return self_type{get_arrow_proxy().slice_view(start, end)};
422 }
423
424 /*template <std::integral IT>
425 auto dictionary_encoded_array<IT>::dummy_inner_const_reference() const -> inner_const_reference
426 {
427 static const inner_const_reference instance =
428 std::visit([](const auto& val) -> inner_const_reference { return val; }, dummy_inner_value());
429 return instance;
430 }*/
431
432 template <std::integral IT>
433 auto dictionary_encoded_array<IT>::dummy_const_reference() const -> const_reference
434 {
435 static const const_reference instance = std::visit(
436 [](const auto& val) -> const_reference
437 {
438 using inner_ref = typename arrow_traits<std::decay_t<decltype(val)>>::const_reference;
439 return const_reference{nullable<inner_ref>(inner_ref(val), false)};
440 },
441 dummy_inner_value()
442 );
443 return instance;
444 }
445
446 template <std::integral IT>
447 typename dictionary_encoded_array<IT>::values_layout
448 dictionary_encoded_array<IT>::create_values_layout(arrow_proxy& proxy)
449 {
450 const auto& dictionary = proxy.dictionary();
451 SPARROW_ASSERT_TRUE(dictionary);
452 arrow_proxy ar_dictionary{&(dictionary->array()), &(dictionary->schema())};
453 return array_factory(std::move(ar_dictionary));
454 }
455
456 template <std::integral IT>
457 auto dictionary_encoded_array<IT>::create_keys_layout(arrow_proxy& proxy) -> keys_layout
458 {
459 return keys_layout{arrow_proxy{&proxy.array(), &proxy.schema()}};
460 }
461
462 template <std::integral IT>
463 auto dictionary_encoded_array<IT>::get_arrow_proxy() -> arrow_proxy&
464 {
465 return m_proxy;
466 }
467
468 template <std::integral IT>
469 auto dictionary_encoded_array<IT>::get_arrow_proxy() const -> const arrow_proxy&
470 {
471 return m_proxy;
472 }
473
474 template <class IT>
476 {
477 return std::ranges::equal(lhs, rhs);
478 }
479}
480
481#if defined(__cpp_lib_format)
/**
 * std::format support for sparrow::dictionary_encoded_array.
 *
 * The output has the form `Dictionary [size=N] <e0, e1, ..., eN-1>`; an empty
 * array is rendered as `Dictionary [size=0] <>`.
 */
template <std::integral IT>
struct std::formatter<sparrow::dictionary_encoded_array<IT>>
{
    /// No format specification is supported: parsing stops where it starts.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes \p ar to the output of \p ctx.
     *
     * @param ar  Array to format.
     * @param ctx Formatting context providing the output iterator.
     * @return Iterator past the last character written.
     */
    auto format(const sparrow::dictionary_encoded_array<IT>& ar, std::format_context& ctx) const
    {
        // Thread the iterator returned by each format_to call instead of
        // re-reading ctx.out(), which is not advanced inside format().
        auto out = std::format_to(ctx.out(), "Dictionary [size={}] <", ar.size());

        // std::prev(cend()) and back() are only valid when there is at least one element.
        if (!ar.empty())
        {
            std::for_each(
                ar.cbegin(),
                std::prev(ar.cend()),
                [&out](const auto& value)
                {
                    out = std::format_to(out, "{}, ", value);
                }
            );
            // The last element is written without a trailing separator.
            out = std::format_to(out, "{}", ar.back());
        }

        return std::format_to(out, ">");
    }
};
505
/**
 * Streams a dictionary_encoded_array using its std::formatter specialization.
 *
 * @param os    Stream to write to.
 * @param value Array to print.
 * @return \p os, to allow chaining.
 */
template <std::integral IT>
std::ostream& operator<<(std::ostream& os, const sparrow::dictionary_encoded_array<IT>& value)
{
    return os << std::format("{}", value);
}
512#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API std::optional< std::string_view > name() const
Smart pointer behaving like a copiable std::unique_ptr.
Definition memory.hpp:127
self_type & operator=(const self_type &)
functor_index_iterator< const_functor_type > const_iterator
self_type slice_view(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
std::optional< key_value_view > metadata() const
layout_element_functor< self_type, true > functor_type
std::optional< std::string_view > name() const
layout_element_functor< self_type, true > const_functor_type
self_type slice(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
functor_index_iterator< functor_type > iterator
const_reference operator[](size_type i) const
constexpr size_type null_count() const noexcept
constexpr layout_element_functor(storage_type layout_)
std:: conditional_t< is_const, typename layout_type::const_reference, typename layout_type::reference > return_type
constexpr layout_element_functor()=default
std::conditional_t< is_const, const layout_type *, layout_type > storage_type
return_type operator()(std::size_t i) const
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
A view that repeats a value a given number of times.
This buffer class is used as the storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
#define SPARROW_ASSERT_TRUE(expr__)
#define SPARROW_ASSERT_FALSE(expr__)
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr std::string_view data_type_format_of()
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool is_dictionary_encoded_array_v
Checks whether T is a dictionary_encoded_array type.
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
SPARROW_API array_traits::inner_value_type array_default_element_value(const array_wrapper &ar)
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArrays and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
primitive_array_impl< T > primitive_array
Array of values whose type has a fixed binary size.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
constexpr bool data_type_is_integer(data_type dt)
dynamic_bitset< std::uint8_t > validity_bitmap
SPARROW_API cloning_ptr< array_wrapper > array_factory(arrow_proxy proxy)
SPARROW_API array_traits::const_reference array_element(const array_wrapper &ar, std::size_t index)
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Definition nullable.hpp:900
mpl::rename< mpl::transform< detail::array_const_reference_t, all_base_types_t >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type
mpl::rename< mpl::transform< detail::array_value_type_t, all_base_types_t >, nullable_variant > value_type
Provides compile-time information about Arrow data types.