sparrow 0.6.0
Loading...
Searching...
No Matches
fixed_width_binary_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstdint>
18#include <iterator>
19#include <ranges>
20#include <string>
21#include <type_traits>
22#include <vector>
23
35
36namespace sparrow
37{
38 template <std::ranges::sized_range T, class CR>
40
42
44 fixed_width_binary_traits::value_type,
45 fixed_width_binary_traits::const_reference>;
46
47 template <std::ranges::sized_range T, class CR>
74
75 namespace detail
76 {
77 template <class T>
78 struct get_data_type_from_array;
79
80 template <>
82 {
83 [[nodiscard]] static constexpr sparrow::data_type get()
84 {
86 }
87 };
88 }
89
90 template <std::ranges::sized_range T, class CR>
92 : public mutable_array_bitmap_base<fixed_width_binary_array_impl<T, CR>>
93 {
94 private:
95
96 static_assert(
97 sizeof(std::ranges::range_value_t<T>) == sizeof(byte_t),
98 "Only sequences of types with the same size as byte_t are supported"
99 );
100
101 public:
102
105
107 using inner_value_type = typename inner_types::inner_value_type;
108 using inner_reference = typename inner_types::inner_reference;
109 using inner_const_reference = typename inner_types::inner_const_reference;
110
112 using bitmap_reference = typename base_type::bitmap_reference;
115
119
123 using data_iterator = typename inner_types::data_iterator;
124
125 using const_data_iterator = typename inner_types::const_data_iterator;
126 using data_value_type = typename inner_types::data_value_type;
127
128 using value_iterator = typename inner_types::value_iterator;
129 using const_value_iterator = typename inner_types::const_value_iterator;
130
131 using functor_type = typename inner_types::functor_type;
132 using const_functor_type = typename inner_types::const_functor_type;
133
135
140 template <class... ARGS>
143 : base_type(create_proxy(std::forward<ARGS>(args)...))
144 , m_element_size(num_bytes_for_fixed_sized_binary(this->get_arrow_proxy().format()))
145 {
146 }
147
148 using base_type::get_arrow_proxy;
149 using base_type::size;
150
152 [[nodiscard]] inner_const_reference value(size_type i) const;
153
154 private:
155
166 template <
169 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
170 [[nodiscard]] static arrow_proxy create_proxy(
171 u8_buffer<C>&& data_buffer,
172 size_t element_size,
173 VB&& validity_input = validity_bitmap{},
174 std::optional<std::string_view> name = std::nullopt,
175 std::optional<METADATA_RANGE> metadata = std::nullopt
176 );
177
187 template <
188 std::ranges::input_range R,
190 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
191 requires(
192 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
194 // range of
195 // char-like
196 )
197 [[nodiscard]] static arrow_proxy create_proxy(
198 R&& values,
199 VB&& validity_input = validity_bitmap{},
200 std::optional<std::string_view> name = std::nullopt,
201 std::optional<METADATA_RANGE> metadata = std::nullopt
202 );
203
213 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
215 && std::ranges::input_range<typename std::ranges::range_value_t<R>::value_type>
216 && std::is_same_v<std::ranges::range_value_t<typename std::ranges::range_value_t<R>::value_type>, byte_t>
217 [[nodiscard]] static arrow_proxy create_proxy(
218 R&&,
219 std::optional<std::string_view> name = std::nullopt,
220 std::optional<METADATA_RANGE> metadata = std::nullopt
221 );
222
223 static constexpr size_t DATA_BUFFER_INDEX = 1;
224
225 [[nodiscard]] data_iterator data(size_type i);
226
227 [[nodiscard]] value_iterator value_begin();
228 [[nodiscard]] value_iterator value_end();
229
230 [[nodiscard]] const_value_iterator value_cbegin() const;
231 [[nodiscard]] const_value_iterator value_cend() const;
232
233 [[nodiscard]] const_data_iterator data(size_type i) const;
234
235 // Modifiers
236
237 template <std::ranges::sized_range U>
238 requires mpl::convertible_ranges<U, T>
239 void resize_values(size_type new_length, U value);
240
241 template <std::ranges::sized_range U>
242 requires mpl::convertible_ranges<U, T>
243 value_iterator insert_value(const_value_iterator pos, U value, size_type count);
244
245 template <typename InputIt>
246 requires std::input_iterator<InputIt>
247 && mpl::convertible_ranges<typename std::iterator_traits<InputIt>::value_type, T>
248 value_iterator insert_values(const_value_iterator pos, InputIt first, InputIt last);
249
250 value_iterator erase_values(const_value_iterator pos, size_type count);
251
252 template <std::ranges::sized_range U>
253 requires mpl::convertible_ranges<U, T>
254 void assign(U&& rhs, size_type index);
255
256 size_t m_element_size = 0;
257
260 friend base_type;
263 };
264
265 /************************************************
266 * fixed_width_binary_array_impl implementation *
267 ************************************************/
268
269 template <std::ranges::sized_range T, class CR>
271 : base_type(std::move(proxy))
272 , m_element_size(num_bytes_for_fixed_sized_binary(this->get_arrow_proxy().format()))
273 {
274 SPARROW_ASSERT_TRUE(this->get_arrow_proxy().data_type() == data_type::FIXED_WIDTH_BINARY);
275 }
276
// Builds an arrow_proxy from a flat buffer of concatenated fixed-size elements.
//
// data_buffer    - bytes of all elements back to back; its size must be a
//                  multiple of element_size (asserted below).
// element_size   - number of bytes per element. It is used as a divisor, so it
//                  must be non-zero, and it is written into the format string
//                  as "w:<element_size>".
// validity_input - forwarded to ensure_validity_bitmap along with the element count.
// name, metadata - moved into the schema arguments.
//
// NOTE(review): the lines that open the two calls whose argument lists appear
// below (numbered 294 and 310 in this listing) are not present here; `schema`
// and `arr`, used in the return statement, are presumably declared on them.
277 template <std::ranges::sized_range T, class CR>
278 template <mpl::char_like C, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
279 arrow_proxy fixed_width_binary_array_impl<T, CR>::create_proxy(
280 u8_buffer<C>&& data_buffer,
281 size_t element_size,
282 VB&& validity_input,
283 std::optional<std::string_view> name,
284 std::optional<METADATA_RANGE> metadata
285 )
286 {
// The buffer must hold a whole number of elements.
287 SPARROW_ASSERT_TRUE((data_buffer.size() % element_size) == 0);
288 const size_t element_count = data_buffer.size() / element_size;
289 validity_bitmap vbitmap = ensure_validity_bitmap(element_count, std::forward<VB>(validity_input));
290 const auto null_count = vbitmap.null_count();
291
// Format string: "w:" followed by the element width in bytes.
292 std::string format_str = "w:" + std::to_string(element_size);
293
295 std::move(format_str),
296 std::move(name), // name
297 std::move(metadata), // metadata
298 std::nullopt, // flags,
299 nullptr, // children
300 repeat_view<bool>(true, 0), // children_ownership
301 nullptr, // dictionary
302 true // dictionary ownership
303
304 );
// Buffer order: validity bitmap storage first, then the data bytes
// (consistent with DATA_BUFFER_INDEX being 1).
305 std::vector<buffer<std::uint8_t>> arr_buffs = {
306 std::move(vbitmap).extract_storage(),
307 std::move(data_buffer).extract_storage()
308 };
309
311 static_cast<std::int64_t>(element_count), // length
312 static_cast<int64_t>(null_count),
313 0, // offset
314 std::move(arr_buffs),
315 nullptr, // children
316 repeat_view<bool>(true, 0), // children_ownership
317 nullptr, // dictionary
318 true // dictionary ownership
319 );
320 return arrow_proxy{std::move(arr), std::move(schema)};
321 }
322
// Builds an arrow_proxy from a range of ranges.
//
// The inner ranges are joined into a single u8_buffer, the element size is
// taken from the size of the first inner range, and the work is delegated to
// the create_proxy overload taking a data buffer and an element size.
//
// values         - range of ranges; must not be empty (asserted below) because
//                  its first element is dereferenced to obtain the element size.
// validity_input - forwarded unchanged to the delegated overload.
// name, metadata - passed on to the delegated overload.
//
// NOTE(review): one line of the requires-clause (numbered 327) and the line
// numbered 341 are not present in this listing.
323 template <std::ranges::sized_range T, class CR>
324 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
325 requires(
326 std::ranges::input_range<std::ranges::range_value_t<R>> && // a range of ranges
328 // range of char-like
329 )
330 arrow_proxy fixed_width_binary_array_impl<T, CR>::create_proxy(
331 R&& values,
332 VB&& validity_input,
333 std::optional<std::string_view> name,
334 std::optional<METADATA_RANGE> metadata
335 )
336 {
337 using values_type = std::ranges::range_value_t<R>;
338 using values_inner_value_type = std::ranges::range_value_t<values_type>;
339
340 SPARROW_ASSERT_TRUE(!std::ranges::empty(values));
// Element width in bytes is the size of the first inner range.
342 const size_t element_size = std::ranges::size(*values.begin());
343
// Flatten all inner ranges into one contiguous buffer.
344 auto data_buffer = u8_buffer<values_inner_value_type>(std::ranges::views::join(values));
345 return create_proxy(
346 std::move(data_buffer),
347 element_size,
348 std::forward<VB>(validity_input),
349 std::forward<std::optional<std::string_view>>(name),
350 std::forward<std::optional<METADATA_RANGE>>(metadata)
351 );
352 }
353
// Builds an arrow_proxy from a range whose elements expose get() and
// has_value().
//
// The input is split into two lazy views: the values (v.get()) and the
// validity flags (v.has_value()). Both are handed to another create_proxy
// overload together with the name and metadata.
//
// NOTE(review): the head of the constraint (numbered 356) is not present in
// this listing; the visible part requires the elements' value_type to be an
// input range of byte_t.
354 template <std::ranges::sized_range T, class CR>
355 template <std::ranges::input_range R, input_metadata_container METADATA_RANGE>
357 && std::ranges::input_range<typename std::ranges::range_value_t<R>::value_type>
358 && std::is_same_v<std::ranges::range_value_t<typename std::ranges::range_value_t<R>::value_type>, byte_t>
359 arrow_proxy fixed_width_binary_array_impl<T, CR>::create_proxy(
360 R&& range,
361 std::optional<std::string_view> name,
362 std::optional<METADATA_RANGE> metadata
363 )
364 {
365 // split into values and is_non_null ranges
366 const auto values = range
367 | std::views::transform(
368 [](const auto& v)
369 {
370 return v.get();
371 }
372 );
373 const auto is_non_null = range
374 | std::views::transform(
375 [](const auto& v)
376 {
377 return v.has_value();
378 }
379 );
// is_non_null is passed in the validity-input position of the delegated overload.
380 return self_type::create_proxy(values, is_non_null, std::move(name), std::move(metadata));
381 }
382
383 template <std::ranges::sized_range T, class CR>
384 auto fixed_width_binary_array_impl<T, CR>::data(size_type i) -> data_iterator
385 {
386 const arrow_proxy& proxy = this->get_arrow_proxy();
387 auto data_buffer = proxy.buffers()[DATA_BUFFER_INDEX];
388 const size_t data_buffer_size = data_buffer.size();
389 const size_type index_offset = (static_cast<size_type>(proxy.offset()) * m_element_size) + i;
390 SPARROW_ASSERT_TRUE(data_buffer_size >= index_offset);
391 return data_buffer.template data<data_value_type>() + index_offset;
392 }
393
394 template <std::ranges::sized_range T, class CR>
395 auto fixed_width_binary_array_impl<T, CR>::data(size_type i) const -> const_data_iterator
396 {
397 const arrow_proxy& proxy = this->get_arrow_proxy();
398 const auto data_buffer = proxy.buffers()[DATA_BUFFER_INDEX];
399 const size_t data_buffer_size = data_buffer.size();
400 const size_type index_offset = (static_cast<size_type>(proxy.offset()) * m_element_size) + i;
401 SPARROW_ASSERT_TRUE(data_buffer_size >= index_offset);
402 return data_buffer.template data<const data_value_type>() + index_offset;
403 }
404
// Overwrites the element at `index` with the contents of `rhs`.
//
// rhs   - range whose size must equal m_element_size (asserted below).
// index - element index; must be smaller than size() (asserted below).
//
// NOTE(review): the line between the template header and the signature
// (numbered 407) is not present in this listing.
405 template <std::ranges::sized_range T, class CR>
406 template <std::ranges::sized_range U>
408 void fixed_width_binary_array_impl<T, CR>::assign(U&& rhs, size_type index)
409 {
410 SPARROW_ASSERT_TRUE(std::ranges::size(rhs) == m_element_size);
411 SPARROW_ASSERT_TRUE(index < size());
// data() takes a byte offset, hence the element index is scaled by the element size.
412 std::copy(std::ranges::begin(rhs), std::ranges::end(rhs), data(index * m_element_size));
413 }
414
415 template <std::ranges::sized_range T, class CR>
421
422 template <std::ranges::sized_range T, class CR>
424 {
425 SPARROW_ASSERT_TRUE(i < this->size());
426 const auto offset_begin = i * m_element_size;
427 const auto offset_end = offset_begin + m_element_size;
428 const const_data_iterator pointer_begin = data(static_cast<size_type>(offset_begin));
429 const const_data_iterator pointer_end = data(static_cast<size_type>(offset_end));
430 return inner_const_reference(pointer_begin, pointer_end);
431 }
432
433 template <std::ranges::sized_range T, class CR>
434 auto fixed_width_binary_array_impl<T, CR>::value_begin() -> value_iterator
435 {
436 return value_iterator{functor_type{&(this->derived_cast())}, 0};
437 }
438
439 template <std::ranges::sized_range T, class CR>
440 auto fixed_width_binary_array_impl<T, CR>::value_end() -> value_iterator
441 {
442 return sparrow::next(value_begin(), size());
443 }
444
445 template <std::ranges::sized_range T, class CR>
446 auto fixed_width_binary_array_impl<T, CR>::value_cbegin() const -> const_value_iterator
447 {
448 return const_value_iterator{const_functor_type{&(this->derived_cast())}, 0};
449 }
450
451 template <std::ranges::sized_range T, class CR>
452 auto fixed_width_binary_array_impl<T, CR>::value_cend() const -> const_value_iterator
453 {
454 return sparrow::next(value_cbegin(), this->size());
455 }
456
// Resizes the data buffer so that it holds `new_length` elements.
//
// new_length - requested number of elements.
// value      - element used to fill new slots when growing; its size must
//              equal m_element_size (asserted below).
//
// When shrinking, the data buffer is truncated; when growing, copies of
// `value` are inserted at the end; when the length is unchanged nothing is done.
//
// NOTE(review): the line between the template header and the signature
// (numbered 459) is not present in this listing.
457 template <std::ranges::sized_range T, class CR>
458 template <std::ranges::sized_range U>
460 void fixed_width_binary_array_impl<T, CR>::resize_values(size_type new_length, U value)
461 {
462 SPARROW_ASSERT_TRUE(m_element_size == value.size());
463 if (new_length < size())
464 {
465 arrow_proxy& proxy = this->get_arrow_proxy();
// Elements located before the proxy offset are kept, so they count towards the new size.
466 const size_t new_size = new_length + static_cast<size_t>(proxy.offset());
// Convert the element count into a byte count.
467 const auto offset = new_size * m_element_size;
468 auto& data_buffer = proxy.get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
469 data_buffer.resize(offset);
470 }
471 else if (new_length > size())
472 {
473 insert_value(value_cend(), value, new_length - size());
474 }
475 }
476
// Inserts `count` copies of `value` before `pos` in the data buffer.
//
// pos   - position before which the copies are inserted.
// value - element to insert; its size must equal m_element_size (asserted
//         below) and it must expose data() and size().
// count - number of copies to insert.
//
// Returns the value iterator at the index designated by `pos`.
//
// NOTE(review): the lines numbered 479 and 481 (between the template header
// and the opening brace) are not present in this listing.
477 template <std::ranges::sized_range T, class CR>
478 template <std::ranges::sized_range U>
480 auto fixed_width_binary_array_impl<T, CR>::insert_value(const_value_iterator pos, U value, size_type count)
482 {
483 SPARROW_ASSERT_TRUE(m_element_size == value.size());
484 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
485
// Copy the bytes of `value` into a uint8_t vector so they can be repeated and joined.
486 const uint8_t* uint8_ptr = reinterpret_cast<const uint8_t*>(value.data());
487 const std::vector<uint8_t> casted_value(uint8_ptr, uint8_ptr + value.size());
488 const repeat_view<std::vector<uint8_t>> my_repeat_view{casted_value, count};
489 const auto joined_repeated_value_range = std::ranges::views::join(my_repeat_view);
490 arrow_proxy& proxy = this->get_arrow_proxy();
491 auto& data_buffer = proxy.get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
// Byte position of the insertion point, taking the proxy offset into account.
492 const auto offset_begin = (idx + proxy.offset()) * m_element_size;
493 const auto pos_to_insert = sparrow::next(data_buffer.cbegin(), offset_begin);
494 data_buffer.insert(pos_to_insert, joined_repeated_value_range.begin(), joined_repeated_value_range.end());
495 return sparrow::next(value_begin(), idx);
496 }
497
498 template <std::ranges::sized_range T, class CR>
499 template <typename InputIt>
500 requires std::input_iterator<InputIt>
502 auto
503 fixed_width_binary_array_impl<T, CR>::insert_values(const_value_iterator pos, InputIt first, InputIt last)
505 {
506 SPARROW_ASSERT_TRUE(value_cbegin() <= pos)
507 SPARROW_ASSERT_TRUE(pos <= value_cend());
508 SPARROW_ASSERT_TRUE(first <= last);
509 SPARROW_ASSERT_TRUE(all_same_size(std::ranges::subrange(first, last)));
510 SPARROW_ASSERT_TRUE(m_element_size == std::ranges::size(*first));
511
512 auto values = std::ranges::subrange(first, last);
513 const size_t cumulative_sizes = values.size() * m_element_size;
514 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
515 data_buffer.resize(data_buffer.size() + cumulative_sizes);
516 const auto idx = static_cast<size_t>(std::distance(value_cbegin(), pos));
517 std::span<byte_t> casted_values{reinterpret_cast<byte_t*>(data_buffer.data()), data_buffer.size()};
518 const auto offset_begin = m_element_size * (idx + get_arrow_proxy().offset());
519 auto insert_pos = sparrow::next(casted_values.begin(), offset_begin);
520
521 // Move elements to make space for the new value
522 std::move_backward(
523 insert_pos,
524 sparrow::next(casted_values.end(), -static_cast<difference_type>(cumulative_sizes)),
525 casted_values.end()
526 );
527
528 for (const auto& val : values)
529 {
530 std::copy(val.begin(), val.end(), insert_pos);
531 std::advance(insert_pos, m_element_size);
532 }
533 return sparrow::next(value_begin(), idx);
534 }
535
536 template <std::ranges::sized_range T, class CR>
537 auto fixed_width_binary_array_impl<T, CR>::erase_values(const_value_iterator pos, size_type count)
538 -> value_iterator
539 {
540 SPARROW_ASSERT_TRUE(pos >= value_cbegin());
541 SPARROW_ASSERT_TRUE(pos <= value_cend());
542 const size_t index = static_cast<size_t>(std::distance(value_cbegin(), pos));
543 if (count == 0)
544 {
545 return sparrow::next(value_begin(), index);
546 }
547 auto& data_buffer = get_arrow_proxy().get_array_private_data()->buffers()[DATA_BUFFER_INDEX];
548 const size_type byte_count = m_element_size * count;
549 const auto offset_begin = m_element_size * (index + static_cast<size_type>(get_arrow_proxy().offset()));
550 const auto offset_end = offset_begin + byte_count;
551 // move the values after the erased ones
552 std::move(
553 data_buffer.begin() + static_cast<difference_type>(offset_end),
554 data_buffer.end(),
555 data_buffer.begin() + static_cast<difference_type>(offset_begin)
556 );
557 data_buffer.resize(data_buffer.size() - byte_count);
558 return sparrow::next(value_begin(), index);
559 }
560}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::iterator_tag iterator_tag
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
constexpr BufferType & buffers() noexcept
Proxy class over ArrowArray and ArrowSchema.
SPARROW_API size_t offset() const
SPARROW_API arrow_array_private_data * get_array_private_data()
constexpr size_type null_count() const noexcept
bitset_iterator< self_type, true > const_iterator
fixed_width_binary_array_impl(ARGS &&... args)
Constructs a fixed-width binary array.
inner_const_reference value(size_type i) const
Implementation of reference to inner type used for layout L.
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
Matches range types From whose elements are convertible to elements of range type To.
Definition mp_utils.hpp:450
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
constexpr bool is_type_instance_of_v
true if T is a concrete type template instantiation of U which is a type template.
Definition mp_utils.hpp:50
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
std::byte byte_t
constexpr bool all_same_size(const Range &range)
Definition ranges.hpp:47
arrow_traits< std::vector< byte_t > > fixed_width_binary_traits
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:503
fixed_width_binary_array_impl< fixed_width_binary_traits::value_type, fixed_width_binary_traits::const_reference > fixed_width_binary_array
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
SPARROW_API std::size_t num_bytes_for_fixed_sized_binary(std::string_view format)
Get the number of bytes for a fixed width binary layout from the ArrowArray format string.
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
detail::layout_value_functor< array_type, inner_reference > functor_type
detail::layout_value_functor< const array_type, inner_const_reference > const_functor_type
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.
Provides compile-time information about Arrow data types.