sparrow 0.3.0
Loading...
Searching...
No Matches
dictionary_encoded_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include "sparrow/array_api.hpp"
30
31namespace sparrow
32{
/**
 * Functor mapping an index to the element found at that position of a layout.
 *
 * The functor stores a raw pointer to the layout and forwards every call to the
 * layout's operator[]. It never deletes the pointer, so the layout must outlive
 * the functor.
 *
 * @tparam Layout   Type of the wrapped layout. It must expose the nested types
 *                  `reference` and `const_reference` and an `operator[]` taking
 *                  a `std::size_t`.
 * @tparam is_const When true the layout is held through a pointer-to-const and
 *                  calls yield `const_reference`; otherwise they yield `reference`.
 */
template <class Layout, bool is_const>
class layout_element_functor
{
public:

    using layout_type = Layout;
    // Both branches must be pointers: operator() dereferences p_layout with `->`.
    // The non-const branch used to be a plain `layout_type`, which made the
    // mutable flavour of the functor impossible to instantiate.
    using storage_type = std::conditional_t<is_const, const layout_type*, layout_type*>;
    using return_type = std::
        conditional_t<is_const, typename layout_type::const_reference, typename layout_type::reference>;

    /// Builds a functor bound to no layout; calling it is invalid until it is reassigned.
    constexpr layout_element_functor() = default;

    /**
     * Builds a functor bound to the given layout.
     *
     * @param layout_ Pointer to the layout whose elements will be accessed.
     */
    constexpr explicit layout_element_functor(storage_type layout_)
        : p_layout(layout_)
    {
    }

    /**
     * Returns the element of the bound layout at position @p i.
     *
     * @param i Index forwarded unchanged to the layout's operator[].
     * @return Whatever the layout's operator[] returns for that index.
     */
    [[nodiscard]] return_type operator()(std::size_t i) const
    {
        return p_layout->operator[](i);
    }

private:

    // Null by default so a default-constructed functor holds a well-defined value.
    storage_type p_layout = nullptr;
};
59
60 template <std::integral IT>
61 class dictionary_encoded_array;
62
63 namespace detail
64 {
65 template <class T>
66 struct get_data_type_from_array;
67
68 template <std::integral IT>
70 {
71 [[nodiscard]] static constexpr sparrow::data_type get()
72 {
74 }
75 };
76
77 template <std::integral IT>
79 {
80 [[nodiscard]] static constexpr bool get()
81 {
82 return true;
83 }
84 };
85 }
86
90 template <class T>
92
93 template <std::integral IT>
95 {
96 public:
97
99 using size_type = std::size_t;
100 using difference_type = std::ptrdiff_t;
101
103
107
110
113
115
117
120
123
124 [[nodiscard]] std::optional<std::string_view> name() const;
125 [[nodiscard]] std::optional<std::string_view> metadata() const;
126
127 [[nodiscard]] size_type size() const;
128 [[nodiscard]] bool empty() const;
129
130 [[nodiscard]] const_reference operator[](size_type i) const;
131
132 [[nodiscard]] iterator begin();
133 [[nodiscard]] iterator end();
134
135 [[nodiscard]] const_iterator begin() const;
136 [[nodiscard]] const_iterator end() const;
137
138 [[nodiscard]] const_iterator cbegin() const;
139 [[nodiscard]] const_iterator cend() const;
140
141 [[nodiscard]] const_reference front() const;
142 [[nodiscard]] const_reference back() const;
143
144 template <class... Args>
146 explicit dictionary_encoded_array(Args&&... args)
147 : dictionary_encoded_array(create_proxy(std::forward<Args>(args)...))
148 {
149 }
150
160 [[nodiscard]] self_type slice(size_type start, size_type end) const;
161
171 [[nodiscard]] self_type slice_view(size_type start, size_type end) const;
172
173 private:
174
175 template <validity_bitmap_input R = validity_bitmap>
176 [[nodiscard]] static auto create_proxy(
177 keys_buffer_type&& keys,
178 array&& values,
179 R&& bitmaps = validity_bitmap{},
180 std::optional<std::string_view> name = std::nullopt,
181 std::optional<std::string_view> metadata = std::nullopt
182 ) -> arrow_proxy;
183
184 using keys_layout = primitive_array<IT>;
185 using values_layout = cloning_ptr<array_wrapper>;
186
187 [[nodiscard]] const inner_value_type& dummy_inner_value() const;
188 // inner_const_reference dummy_inner_const_reference() const;
189 [[nodiscard]] const_reference dummy_const_reference() const;
190
191 [[nodiscard]] static keys_layout create_keys_layout(arrow_proxy& proxy);
192 [[nodiscard]] static values_layout create_values_layout(arrow_proxy& proxy);
193
194 [[nodiscard]] arrow_proxy& get_arrow_proxy();
195 [[nodiscard]] const arrow_proxy& get_arrow_proxy() const;
196
197 arrow_proxy m_proxy;
198 keys_layout m_keys_layout;
199 values_layout p_values_layout;
200
202 };
203
204 template <class IT>
206
207 /*******************************************
208 * dictionary_encoded_array implementation *
209 *******************************************/
210
211 template <std::integral IT>
213 : m_proxy(std::move(proxy))
214 , m_keys_layout(create_keys_layout(m_proxy))
215 , p_values_layout(create_values_layout(m_proxy))
216 {
217 SPARROW_ASSERT_TRUE(data_type_is_integer(m_proxy.data_type()));
218 }
219
220 template <std::integral IT>
222 : m_proxy(rhs.m_proxy)
223 , m_keys_layout(create_keys_layout(m_proxy))
224 , p_values_layout(create_values_layout(m_proxy))
225 {
226 }
227
228 template <std::integral IT>
230 {
231 if (this != &rhs)
232 {
233 m_proxy = rhs.m_proxy;
234 m_keys_layout = create_keys_layout(m_proxy);
235 p_values_layout = create_values_layout(m_proxy);
236 }
237 return *this;
238 }
239
240 template <std::integral IT>
242 : m_proxy(std::move(rhs.m_proxy))
243 , m_keys_layout(create_keys_layout(m_proxy))
244 , p_values_layout(create_values_layout(m_proxy))
245 {
246 }
247
248 template <std::integral IT>
250 {
251 if (this != &rhs)
252 {
253 using std::swap;
254 swap(m_proxy, rhs.m_proxy);
255 m_keys_layout = create_keys_layout(m_proxy);
256 p_values_layout = create_values_layout(m_proxy);
257 }
258 return *this;
259 }
260
261 template <std::integral IT>
262 template <validity_bitmap_input VBI>
263 auto dictionary_encoded_array<IT>::create_proxy(
264 keys_buffer_type&& keys,
265 array&& values,
266 VBI&& validity_input,
267 std::optional<std::string_view> name,
268 std::optional<std::string_view> metadata
269 ) -> arrow_proxy
270 {
271 const auto size = keys.size();
272 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VBI>(validity_input));
273
274 auto [value_array, value_schema] = extract_arrow_structures(std::move(values));
275 const auto null_count = vbitmap.null_count();
276
277 // create arrow schema and array
280 std::move(name), // name
281 std::move(metadata), // metadata
282 std::nullopt, // flags
283 0, // n_children
284 nullptr, // children
285 new ArrowSchema(std::move(value_schema)) // dictionary
286 );
287
288 std::vector<buffer<uint8_t>> buffers(2);
289 buffers[0] = std::move(vbitmap).extract_storage();
290 buffers[1] = std::move(keys).extract_storage();
291
292 // create arrow array
294 static_cast<std::int64_t>(size), // length
295 static_cast<int64_t>(null_count),
296 0, // offset
297 std::move(buffers),
298 0, // n_children
299 nullptr, // children
300 new ArrowArray(std::move(value_array)) // dictionary
301 );
302 return arrow_proxy(std::move(arr), std::move(schema));
303 }
304
305 template <std::integral IT>
306 std::optional<std::string_view> dictionary_encoded_array<IT>::name() const
307 {
308 return m_proxy.name();
309 }
310
311 template <std::integral IT>
312 std::optional<std::string_view> dictionary_encoded_array<IT>::metadata() const
313 {
314 return m_proxy.metadata();
315 }
316
317 template <std::integral IT>
319 {
320 return m_proxy.length();
321 }
322
323 template <std::integral IT>
325 {
326 return size() == 0;
327 }
328
329 template <std::integral IT>
331 {
333 const auto index = m_keys_layout[i];
334
335 if (index.has_value())
336 {
337 SPARROW_ASSERT_TRUE(index.value() >= 0);
338 return array_element(*p_values_layout, static_cast<std::size_t>(index.value()));
339 }
340 else
341 {
342 return dummy_const_reference();
343 }
344 }
345
346 template <std::integral IT>
348 {
349 return iterator(functor_type(this), 0u);
350 }
351
352 template <std::integral IT>
354 {
355 return iterator(functor_type(this), size());
356 }
357
358 template <std::integral IT>
360 {
361 return cbegin();
362 }
363
364 template <std::integral IT>
366 {
367 return cend();
368 }
369
370 template <std::integral IT>
375
376 template <std::integral IT>
381
382 template <std::integral IT>
388
389 template <std::integral IT>
391 {
393 return operator[](size() - 1);
394 }
395
396 template <std::integral IT>
397 auto dictionary_encoded_array<IT>::dummy_inner_value() const -> const inner_value_type&
398 {
399 static const inner_value_type instance = array_default_element_value(*p_values_layout);
400 return instance;
401 }
402
403 template <std::integral IT>
405 {
406 SPARROW_ASSERT_TRUE(start <= end);
407 return self_type{get_arrow_proxy().slice(start, end)};
408 }
409
410 template <std::integral IT>
412 {
413 SPARROW_ASSERT_TRUE(start <= end);
414 return self_type{get_arrow_proxy().slice_view(start, end)};
415 }
416
417 /*template <std::integral IT>
418 auto dictionary_encoded_array<IT>::dummy_inner_const_reference() const -> inner_const_reference
419 {
420 static const inner_const_reference instance =
421 std::visit([](const auto& val) -> inner_const_reference { return val; }, dummy_inner_value());
422 return instance;
423 }*/
424
425 template <std::integral IT>
426 auto dictionary_encoded_array<IT>::dummy_const_reference() const -> const_reference
427 {
428 static const const_reference instance = std::visit(
429 [](const auto& val) -> const_reference
430 {
431 using inner_ref = typename arrow_traits<std::decay_t<decltype(val)>>::const_reference;
432 return const_reference{nullable<inner_ref>(inner_ref(val), false)};
433 },
434 dummy_inner_value()
435 );
436 return instance;
437 }
438
439 template <std::integral IT>
440 typename dictionary_encoded_array<IT>::values_layout
441 dictionary_encoded_array<IT>::create_values_layout(arrow_proxy& proxy)
442 {
443 const auto& dictionary = proxy.dictionary();
444 SPARROW_ASSERT_TRUE(dictionary);
445 arrow_proxy ar_dictionary{&(dictionary->array()), &(dictionary->schema())};
446 return array_factory(std::move(ar_dictionary));
447 }
448
449 template <std::integral IT>
450 auto dictionary_encoded_array<IT>::create_keys_layout(arrow_proxy& proxy) -> keys_layout
451 {
452 return keys_layout{arrow_proxy{&proxy.array(), &proxy.schema()}};
453 }
454
455 template <std::integral IT>
456 auto dictionary_encoded_array<IT>::get_arrow_proxy() -> arrow_proxy&
457 {
458 return m_proxy;
459 }
460
461 template <std::integral IT>
462 auto dictionary_encoded_array<IT>::get_arrow_proxy() const -> const arrow_proxy&
463 {
464 return m_proxy;
465 }
466
467 template <class IT>
469 {
470 return std::ranges::equal(lhs, rhs);
471 }
472}
473
474#if defined(__cpp_lib_format)
/**
 * std::formatter specialization for sparrow::dictionary_encoded_array.
 *
 * Output shape: `Dictionary [size=N] <v0, v1, ..., vN-1>`. An empty array is
 * rendered as `Dictionary [size=0] <>`.
 */
template <std::integral IT>
struct std::formatter<sparrow::dictionary_encoded_array<IT>>
{
    /// No format specification is supported; parsing consumes nothing.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes the textual representation of @p ar to the context's output.
     *
     * @param ar  Array to format.
     * @param ctx Format context supplying the output iterator.
     * @return Iterator past the last character written.
     */
    auto format(const sparrow::dictionary_encoded_array<IT>& ar, std::format_context& ctx) const
    {
        auto out = std::format_to(ctx.out(), "Dictionary [size={}] <", ar.size());

        // Put the separator before every element but the first. This needs no
        // "last element" lookup, so an empty array is handled too; the earlier
        // std::prev(cend()) / back() version was invalid when size() == 0.
        bool is_first = true;
        for (const auto& value : ar)
        {
            if (!is_first)
            {
                out = std::format_to(out, ", ");
            }
            out = std::format_to(out, "{}", value);
            is_first = false;
        }

        return std::format_to(out, ">");
    }
};
498
/// Streams @p value to @p os using its std::formatter representation.
template <std::integral IT>
std::ostream& operator<<(std::ostream& os, const sparrow::dictionary_encoded_array<IT>& value)
{
    return os << std::format("{}", value);
}
505#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:39
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API std::optional< std::string_view > name() const
Smart pointer behaving like a copiable std::unique_ptr.
Definition memory.hpp:127
self_type & operator=(const self_type &)
functor_index_iterator< const_functor_type > const_iterator
std::optional< std::string_view > metadata() const
self_type slice_view(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
layout_element_functor< self_type, true > functor_type
std::optional< std::string_view > name() const
layout_element_functor< self_type, true > const_functor_type
self_type slice(size_type start, size_type end) const
Slices the array to keep only the elements between the given start and end.
functor_index_iterator< functor_type > iterator
const_reference operator[](size_type i) const
constexpr size_type null_count() const noexcept
constexpr layout_element_functor(storage_type layout_)
std:: conditional_t< is_const, typename layout_type::const_reference, typename layout_type::reference > return_type
constexpr layout_element_functor()=default
std::conditional_t< is_const, const layout_type *, layout_type > storage_type
return_type operator()(std::size_t i) const
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
#define SPARROW_ASSERT_TRUE(expr__)
#define SPARROW_ASSERT_FALSE(expr__)
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
constexpr std::string_view data_type_format_of()
ArrowSchema make_arrow_schema(F format, N name, M metadata, std::optional< ArrowFlag > flags, int64_t n_children, ArrowSchema **children, ArrowSchema *dictionary)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool is_dictionary_encoded_array_v
Checks whether T is a dictionary_encoded_array type.
SPARROW_API void swap(ArrowArray &lhs, ArrowArray &rhs)
Swaps the contents of the two ArrowArray objects.
SPARROW_API array_traits::inner_value_type array_default_element_value(const array_wrapper &ar)
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArrays and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:91
constexpr bool data_type_is_integer(data_type dt)
dynamic_bitset< std::uint8_t > validity_bitmap
SPARROW_API cloning_ptr< array_wrapper > array_factory(arrow_proxy proxy)
SPARROW_API array_traits::const_reference array_element(const array_wrapper &ar, std::size_t index)
std::ostream & operator<<(std::ostream &stream, T n)
Definition large_int.hpp:93
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
array_trivial_copyable< T > primitive_array
Array of values whose type has a fixed binary size.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, size_t n_children, ArrowArray **children, ArrowArray *dictionary)
Creates an ArrowArray.
mpl::rename< mpl::transform< detail::array_const_reference_t, all_base_types_t >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type
mpl::rename< mpl::transform< detail::array_value_type_t, all_base_types_t >, nullable_variant > value_type
Provides compile-time information about Arrow data types.