sparrow 0.6.0
Loading...
Searching...
No Matches
variable_size_binary_view_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <cstddef>
18#include <ranges>
19
33
34namespace sparrow
35{
36
37
38 template <class T>
42
43 namespace detail
44 {
// Trait keyed on an array type. Only the primary template is declared here;
// each explicit specialization below exposes a static constexpr get() that
// returns a sparrow::data_type.
45 template <class T>
46 struct get_data_type_from_array;
47
// NOTE(review): the listing numbering jumps 48 -> 50 and 52 -> 54, i.e. the
// specialization head and the return statement are not visible here.
48 template <>
50 {
51 [[nodiscard]] static constexpr sparrow::data_type get()
52 {
54 }
55 };
56
// NOTE(review): same gaps as above (57 -> 59 and 61 -> 63).
57 template <>
59 {
60 [[nodiscard]] static constexpr sparrow::data_type get()
61 {
63 }
64 };
65 }
66
67 template <class T>
80
81 template <class T>
83 {
84 };
85
86 template <class T>
90
94 template <class T>
96
// Array layout whose values are exposed as views of type T. It derives (CRTP)
// from mutable_array_bitmap_base<variable_size_binary_view_array_impl<T>>.
// NOTE(review): the class head (listing line 98) and several alias/constructor
// lines are missing from this listing; only the visible members are described.
97 template <class T>
99 : public mutable_array_bitmap_base<variable_size_binary_view_array_impl<T>>
100 {
101 public:
102
105
107 using inner_value_type = typename inner_types::inner_value_type;
108 using inner_reference = typename inner_types::inner_reference;
109 using inner_const_reference = typename inner_types::inner_const_reference;
110
112 using bitmap_reference = typename base_type::bitmap_reference;
116 using bitmap_range = typename base_type::bitmap_range;
118
122
126
127 using value_iterator = typename base_type::value_iterator;
128 using const_value_iterator = typename base_type::const_value_iterator;
129
130 using iterator = typename base_type::iterator;
131 using const_iterator = typename base_type::const_iterator;
132
134
// Variadic constructor: forwards its arguments to create_proxy() and delegates
// to another constructor of this class with the resulting arrow_proxy.
// NOTE(review): the constructor's own signature line is not visible here.
135 template <class... Args>
138 : variable_size_binary_view_array_impl(create_proxy(std::forward<Args>(args)...))
139 {
140 }
141
142 private:
143
// Builds the arrow_proxy (ArrowArray + ArrowSchema) from a range of values,
// an optional validity input, an optional name and optional metadata.
144 template <std::ranges::input_range R, validity_bitmap_input VB = validity_bitmap, input_metadata_container METADATA_RANGE>
145 requires std::convertible_to<std::ranges::range_value_t<R>, T>
146 [[nodiscard]] static arrow_proxy create_proxy(
147 R&& range,
148 VB&& bitmap_input = validity_bitmap{},
149 std::optional<std::string_view> name = std::nullopt,
150 std::optional<METADATA_RANGE> metadata = std::nullopt
151 );
152
// Element access; the non-const overload forwards to the const one.
153 [[nodiscard]] inner_reference value(size_type i);
154 [[nodiscard]] inner_const_reference value(size_type i) const;
155
156 [[nodiscard]] value_iterator value_begin();
157 [[nodiscard]] value_iterator value_end();
158
159 [[nodiscard]] const_value_iterator value_cbegin() const;
160 [[nodiscard]] const_value_iterator value_cend() const;
161
// Layout of one view record, as written by create_proxy():
//   bytes [0, 4)   value length, stored as int32
//   bytes [4, 16)  the value itself when length <= SHORT_STRING_SIZE
//   otherwise:
//   bytes [4, 8)   first PREFIX_SIZE bytes of the value
//   bytes [8, 12)  buffer index, stored as int32
//   bytes [12, 16) offset of the value inside that buffer, stored as int32
// Index, within the array's buffers, of the buffer holding the view records.
162 static constexpr size_type LENGTH_BUFFER_INDEX = 1;
// Size in bytes of one view record.
163 static constexpr std::size_t DATA_BUFFER_SIZE = 16;
// Largest value length (in bytes) stored inline in the record.
164 static constexpr std::size_t SHORT_STRING_SIZE = 12;
// Number of leading bytes copied into the record for a long value.
165 static constexpr std::size_t PREFIX_SIZE = 4;
// Byte offsets of the fields inside a record.
166 static constexpr std::ptrdiff_t PREFIX_OFFSET = 4;
167 static constexpr std::ptrdiff_t SHORT_STRING_OFFSET = 4;
168 static constexpr std::ptrdiff_t BUFFER_INDEX_OFFSET = 8;
169 static constexpr std::ptrdiff_t BUFFER_OFFSET_OFFSET = 12;
// Position, within the array's buffers, of the buffer receiving long values;
// create_proxy() stores this number as the record's buffer index.
170 static constexpr std::size_t FIRST_VAR_DATA_BUFFER_INDEX = 2;
171
172 friend base_type;
174 };
175
176 template <class T>
181
// Builds the arrow_proxy backing a binary/string view array.
//
// Four buffers are produced, in this order: the validity bitmap storage, the
// view buffer (one DATA_BUFFER_SIZE-byte record per value), a single data
// buffer holding every value longer than SHORT_STRING_SIZE bytes, and a
// one-element int64 buffer holding the size of that data buffer.
//
// range          values to store; each must provide size() and be a range
//                whose elements can be static_cast to std::uint8_t
// validity_input forwarded to ensure_validity_bitmap()
// name, metadata forwarded to make_arrow_schema()
//
// NOTE(review): `range` is iterated by both loops below (after range_size()),
// but R is only constrained as std::ranges::input_range; a single-pass range
// would not survive the second traversal - confirm callers pass multi-pass
// ranges.
// NOTE(review): listing lines 278, 287 and 306 are missing here; judging by
// the make_arrow_schema/make_arrow_array signatures they presumably carry the
// children-ownership argument - verify against the real header.
182 template <class T>
183 template <std::ranges::input_range R, validity_bitmap_input VB, input_metadata_container METADATA_RANGE>
184 requires std::convertible_to<std::ranges::range_value_t<R>, T>
185 arrow_proxy variable_size_binary_view_array_impl<T>::create_proxy(
186 R&& range,
187 VB&& validity_input,
188 std::optional<std::string_view> name,
189 std::optional<METADATA_RANGE> metadata
190 )
191 {
// The reinterpret_casts from uint8_t* to int32_t* below trigger -Wcast-align.
192#ifdef __GNUC__
193# pragma GCC diagnostic push
194# pragma GCC diagnostic ignored "-Wcast-align"
195#endif
196
197 const auto size = range_size(range);
198 validity_bitmap vbitmap = ensure_validity_bitmap(size, std::forward<VB>(validity_input));
199 const auto null_count = vbitmap.null_count();
200
// View buffer: one DATA_BUFFER_SIZE-byte record per value.
// NOTE(review): record bytes past the written fields are never assigned here;
// whether they are zeroed depends on buffer's constructor - confirm.
201 buffer<uint8_t> length_buffer(size * DATA_BUFFER_SIZE);
202
// First pass: fill every record and total the bytes needed for long values.
203 std::size_t long_string_storage_size = 0;
204 std::size_t i = 0;
205 for (auto&& val : range)
206 {
// Lazy view converting each element of the value to std::uint8_t.
207 auto val_casted = val
208 | std::ranges::views::transform(
209 [](const auto& v)
210 {
211 return static_cast<std::uint8_t>(v);
212 }
213 );
214
215 const auto length = val.size();
216 auto length_ptr = length_buffer.data() + (i * DATA_BUFFER_SIZE);
217
218 // write length
// NOTE(review): narrowed to int32; a length above INT32_MAX would not
// round-trip.
219 *reinterpret_cast<std::int32_t*>(length_ptr) = static_cast<std::int32_t>(length);
220
221 if (length <= SHORT_STRING_SIZE)
222 {
223 // write data itself
224 sparrow::ranges::copy(val_casted, length_ptr + SHORT_STRING_OFFSET);
225 }
226 else
227 {
228 // write the prefix of the data
229 auto prefix_sub_range = val_casted | std::ranges::views::take(PREFIX_SIZE);
230 sparrow::ranges::copy(prefix_sub_range, length_ptr + PREFIX_OFFSET);
231
232 // write the buffer index
// NOTE(review): this stores the position within the full buffers vector (2),
// which is what value() indexes with; the Arrow format counts variadic data
// buffers from 0 - confirm interoperability expectations.
233 *reinterpret_cast<std::int32_t*>(
234 length_ptr + BUFFER_INDEX_OFFSET
235 ) = static_cast<std::int32_t>(FIRST_VAR_DATA_BUFFER_INDEX);
236
237 // write the buffer offset
// The offset is the running total of long-value bytes seen so far, i.e. where
// the second pass will place this value. Narrowed to int32 as well.
238 *reinterpret_cast<std::int32_t*>(
239 length_ptr + BUFFER_OFFSET_OFFSET
240 ) = static_cast<std::int32_t>(long_string_storage_size);
241
242 // count the size of the long string storage
243 long_string_storage_size += length;
244 }
245 ++i;
246 }
247
248 // write the long string storage
// Second pass: long values are appended in iteration order, matching the
// offsets recorded during the first pass.
249 buffer<uint8_t> long_string_storage(long_string_storage_size);
250 std::size_t long_string_storage_offset = 0;
251 for (auto&& val : range)
252 {
253 const auto length = val.size();
254 if (length > SHORT_STRING_SIZE)
255 {
256 auto val_casted = val
257 | std::ranges::views::transform(
258 [](const auto& v)
259 {
260 return static_cast<std::uint8_t>(v);
261 }
262 );
263
264 sparrow::ranges::copy(val_casted, long_string_storage.data() + long_string_storage_offset);
265 long_string_storage_offset += length;
266 }
267 }
268
269 // For binary or utf-8 view arrays, an extra buffer is appended which stores
270 // the lengths of each variadic data buffer as int64_t.
271 // This buffer is necessary since these buffer lengths are not trivially
272 // extractable from other data in an array of binary or utf-8 view type.
// Only one variadic data buffer is produced here, hence a single entry.
273 u8_buffer<int64_t> buffer_sizes(
274 static_cast<std::size_t>(1),
275 static_cast<int64_t>(long_string_storage_size)
276 );
277
279
280 // create arrow schema and array
// Format string: "vu" (utf-8 view) when T is std::string_view, otherwise
// "vz" (binary view).
281 ArrowSchema schema = make_arrow_schema(
282 std::is_same<T, std::string_view>::value ? std::string_view("vu") : std::string_view("vz"),
283 std::move(name), // name
284 std::move(metadata), // metadata
285 std::nullopt, // flags
286 nullptr, // children
288 nullptr, // dictionary
289 true
290 );
291
// Buffer order matters: index 1 is LENGTH_BUFFER_INDEX and index 2 is
// FIRST_VAR_DATA_BUFFER_INDEX.
292 std::vector<buffer<uint8_t>> buffers{
293 std::move(vbitmap).extract_storage(),
294 std::move(length_buffer),
295 std::move(long_string_storage),
296 std::move(buffer_sizes).extract_storage()
297 };
298
299 // create arrow array
300 ArrowArray arr = make_arrow_array(
301 static_cast<std::int64_t>(size), // length
302 static_cast<int64_t>(null_count),
303 0, // offset
304 std::move(buffers),
305 nullptr, // children
307 nullptr, // dictionary
308 true
309 );
310
311 return arrow_proxy{std::move(arr), std::move(schema)};
312
// Preprocessor directives are handled regardless of the return above, so the
// diagnostic state is still restored for the rest of the translation unit.
313#ifdef __GNUC__
314# pragma GCC diagnostic pop
315#endif
316 }
317
318 template <class T>
319 auto variable_size_binary_view_array_impl<T>::value(size_type i) -> inner_reference
320 {
321 return static_cast<const self_type*>(this)->value(i);
322 }
323
324 template <class T>
325 auto variable_size_binary_view_array_impl<T>::value(size_type i) const -> inner_const_reference
326 {
327#ifdef __GNUC__
328# pragma GCC diagnostic push
329# pragma GCC diagnostic ignored "-Wcast-align"
330#endif
331
332 SPARROW_ASSERT_TRUE(i < this->size());
333
334 constexpr std::size_t element_size = 16;
335 auto data_ptr = this->get_arrow_proxy().buffers()[LENGTH_BUFFER_INDEX].template data<uint8_t>()
336 + (i * element_size);
337
338 auto length = static_cast<std::size_t>(*reinterpret_cast<const std::int32_t*>(data_ptr));
339 using char_or_byte = typename inner_const_reference::value_type;
340
341 if (length <= 12)
342 {
343 constexpr std::ptrdiff_t data_offset = 4;
344 auto ptr = reinterpret_cast<const char_or_byte*>(data_ptr);
345 const auto ret = inner_const_reference(ptr + data_offset, length);
346 return ret;
347 }
348 else
349 {
350 constexpr std::ptrdiff_t buffer_index_offset = 8;
351 constexpr std::ptrdiff_t buffer_offset_offset = 12;
352 auto buffer_index = static_cast<std::size_t>(
353 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_index_offset)
354 );
355 auto buffer_offset = static_cast<std::size_t>(
356 *reinterpret_cast<const std::int32_t*>(data_ptr + buffer_offset_offset)
357 );
358 auto buffer = this->get_arrow_proxy().buffers()[buffer_index].template data<const char_or_byte>();
359 return inner_const_reference(buffer + buffer_offset, length);
360 }
361
362#ifdef __GNUC__
363# pragma GCC diagnostic pop
364#endif
365 }
366
367 template <class T>
368 auto variable_size_binary_view_array_impl<T>::value_begin() -> value_iterator
369 {
371 }
372
373 template <class T>
374 auto variable_size_binary_view_array_impl<T>::value_end() -> value_iterator
375 {
376 return value_iterator(detail::layout_value_functor<self_type, inner_value_type>(this), this->size());
377 }
378
379 template <class T>
380 auto variable_size_binary_view_array_impl<T>::value_cbegin() const -> const_value_iterator
381 {
382 return const_value_iterator(detail::layout_value_functor<const self_type, inner_value_type>(this), 0);
383 }
384
385 template <class T>
386 auto variable_size_binary_view_array_impl<T>::value_cend() const -> const_value_iterator
387 {
388 return const_value_iterator(
390 this->size()
391 );
392 }
393}
typename base_type::const_bitmap_range const_bitmap_range
typename base_type::bitmap_iterator bitmap_iterator
typename base_type::iterator_tag iterator_tag
typename base_type::const_bitmap_iterator const_bitmap_iterator
std::conditional_t< is_mutable, mutable_array_base< D >, array_crtp_base< D > > base_type
typename base_type::bitmap_const_reference bitmap_const_reference
typename base_type::bitmap_type bitmap_type
typename base_type::difference_type difference_type
Proxy class over ArrowArray and ArrowSchema.
Object that owns a piece of contiguous memory.
Definition buffer.hpp:109
constexpr size_type null_count() const noexcept
The nullable class models a value or a reference that can be "null", or missing, like values traditio...
Definition nullable.hpp:280
A view that repeats a value a given number of times.
This buffer class is used as storage buffer for all sparrow arrays.
Definition u8_buffer.hpp:75
nullable< inner_const_reference, bitmap_const_reference > const_reference
variable_size_binary_view_array_impl< std::string_view > self_type
#define SPARROW_ASSERT_TRUE(expr__)
constexpr std::size_t size(typelist< T... >={})
Definition mp_utils.hpp:107
constexpr bool excludes_copy_and_move_ctor_v
Definition mp_utils.hpp:507
constexpr std::ranges::copy_result< std::ranges::borrowed_iterator_t< R >, O > copy(R &&r, O result)
Definition ranges.hpp:118
array_bitmap_base_impl< D, true > mutable_array_bitmap_base
Convenient typedef to be used as a crtp base class for arrays using a mutable validity buffer.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
constexpr bool is_variable_size_binary_view_array
Checks whether T is a variable_size_binary_view_array_impl type.
variable_size_binary_view_array_impl< std::span< const std::byte > > binary_view_array
variable_size_binary_view_array_impl< std::string_view > string_view_array
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
dynamic_bitset< std::uint8_t > validity_bitmap
validity_bitmap ensure_validity_bitmap(std::size_t size, R &&validity_input)
std::size_t range_size(R &&r)
Definition ranges.hpp:33
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
functor_index_iterator< detail::layout_value_functor< array_type, inner_value_type > > value_iterator
functor_index_iterator< detail::layout_value_functor< const array_type, inner_reference > > const_value_iterator
Base class for array_inner_types specialization.
Traits class that must be specialized by array classes inheriting from array_crtp_base.