sparrow 2.2.1
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
data_type.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <chrono>
18#include <version>
19
25
26#if defined(SPARROW_USE_DATE_POLYFILL)
27
28# include <date/tz.h>
29
30# if defined(__cpp_lib_format)
31# include <format>
32
/// std::formatter specialization that makes `date::zoned_time<T>` usable with
/// std::format when the date polyfill is selected.
///
/// No format specification is interpreted; the value is rendered through its
/// stream insertion operator.
template <typename T>
struct std::formatter<date::zoned_time<T>>
{
    /// Consumes nothing from the format specification.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    /// Writes the text produced by `operator<<` for `date` to the output
    /// iterator and returns the iterator past the written characters.
    auto format(const date::zoned_time<T>& date, std::format_context& ctx) const
    {
        std::ostringstream buffer;
        buffer << date;
        return std::format_to(ctx.out(), "{}", buffer.str());
    }
};
49# endif
50
51#else
52namespace date = std::chrono;
53#endif
54
55#include <climits>
56#include <concepts>
57#include <cstdint>
58#include <cstring>
59#include <sstream>
60#include <string>
61
67
68
69#if __cplusplus > 202002L and defined(__STDCPP_FLOAT16_T__) and defined(__STDCPP_FLOAT32_T__) \
70 and defined(__STDCPP_FLOAT64_T__)
71# define SPARROW_STD_FIXED_FLOAT_SUPPORT
72#endif
73
74// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
75// https://en.cppreference.com/w/cpp/types/floating-point
76#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
77# include <stdfloat>
78#else
80#endif
81
82
83namespace sparrow
84{
85
86// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
87// https://en.cppreference.com/w/cpp/types/floating-point
88#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
89 using float16_t = std::float16_t;
90 using float32_t = std::float32_t;
91 using float64_t = std::float64_t;
92#else
94 using float32_t = float;
95 using float64_t = double;
96#endif
97
98 // P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
99 // See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
100 // For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
101 // TODO: use the following once libc++ has full support for P0355R7.
102 // using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
103 template <typename Duration, typename TimeZonePtr = const date::time_zone*>
104 using timestamp = date::zoned_time<Duration, TimeZonePtr>;
105
106 // We need to be sure the current target platform is set up to correctly support these types.
107 static_assert(sizeof(float16_t) == 2);
108 static_assert(sizeof(float32_t) == 4);
109 static_assert(sizeof(float64_t) == 8);
110 static_assert(std::is_floating_point_v<float16_t>);
111 static_assert(std::is_floating_point_v<float32_t>);
112 static_assert(std::is_floating_point_v<float64_t>);
113 static_assert(CHAR_BIT == 8);
114
115 using byte_t = std::byte; // For now we will use this to represent raw data TODO: evaluate later if it's
116 // the right choice, switch to char if not
117
119 {
120 };
121
122 constexpr bool operator==(const null_type&, const null_type&) noexcept
123 {
124 return true;
125 }
126
184
/// Checks whether a string consists solely of ASCII decimal digits.
///
/// @param s The characters to inspect.
/// @return `true` if `s` is non-empty and every character lies in '0'..'9';
///         `false` otherwise (including for an empty string).
[[nodiscard]] constexpr bool all_digits(const std::string_view s)
{
    if (s.empty())
    {
        return false;
    }
    // A range comparison is used instead of std::isdigit because the latter is
    // not constexpr, which would prevent constant evaluation of this function.
    for (const char c : s)
    {
        if (c < '0' || c > '9')
        {
            return false;
        }
    }
    return true;
}
198
199 // get the number of bytes of the decimal value type from format
200 [[nodiscard]] SPARROW_API std::size_t num_bytes_for_decimal(const char* format);
201
204 // TODO: consider returning an optional instead
205 [[nodiscard]] constexpr data_type format_to_data_type(std::string_view format)
206 {
207 if (format.size() == 1)
208 {
209 switch (format[0])
210 {
211 case 'n':
212 return data_type::NA;
213 case 'b':
214 return data_type::BOOL;
215 case 'C':
216 return data_type::UINT8;
217 case 'c':
218 return data_type::INT8;
219 case 'S':
220 return data_type::UINT16;
221 case 's':
222 return data_type::INT16;
223 case 'I':
224 return data_type::UINT32;
225 case 'i':
226 return data_type::INT32;
227 case 'L':
228 return data_type::UINT64;
229 case 'l':
230 return data_type::INT64;
231 case 'e':
233 case 'f':
234 return data_type::FLOAT;
235 case 'g':
236 return data_type::DOUBLE;
237 case 'u':
238 return data_type::STRING;
239 case 'U':
241 case 'z':
242 return data_type::BINARY;
243 case 'Z':
245 default:
246 return data_type::NA;
247 }
248 }
249 else if (format == "vu") // string view
250 {
252 }
253 else if (format == "vz") // binary view
254 {
256 }
257 else if (format.starts_with("t"))
258 {
259 if (format == "tdD")
260 {
262 }
263 else if (format == "tdm")
264 {
266 }
267 else if (format.starts_with("tss:"))
268 {
270 }
271 else if (format.starts_with("tsm:"))
272 {
274 }
275 else if (format.starts_with("tsu:"))
276 {
278 }
279 else if (format.starts_with("tsn:"))
280 {
282 }
283 else if (format == "tDs")
284 {
286 }
287 else if (format == "tDm")
288 {
290 }
291 else if (format == "tDu")
292 {
294 }
295 else if (format == "tDn")
296 {
298 }
299 else if (format == "tiM")
300 {
302 }
303 else if (format == "tiD")
304 {
306 }
307 else if (format == "tin")
308 {
310 }
311 else if (format == "tts")
312 {
314 }
315 else if (format == "ttm")
316 {
318 }
319 else if (format == "ttu")
320 {
322 }
323 else if (format == "ttn")
324 {
326 }
327 }
328 else if (format == "+l")
329 {
330 return data_type::LIST;
331 }
332 else if (format == "+L")
333 {
335 }
336 else if (format == "+vl")
337 {
339 }
340 else if (format == "+vL")
341 {
343 }
344 else if (format.starts_with("+w:"))
345 {
347 }
348 else if (format == "+s")
349 {
350 return data_type::STRUCT;
351 }
352 else if (format == "+m")
353 {
354 return data_type::MAP;
355 }
356 else if (format.starts_with("+ud:"))
357 {
359 }
360 else if (format.starts_with("+us:"))
361 {
363 }
364 else if (format.starts_with("+r"))
365 {
367 }
368 else if (format.starts_with("d:"))
369 {
370 const auto num_bytes = num_bytes_for_decimal(format.data());
371 switch (num_bytes)
372 {
373 case 4:
375 case 8:
377 case 16:
379 case 32:
381 default:
382 throw std::runtime_error("Invalid format for decimal");
383 }
384 }
385 else if (format.starts_with("w:"))
386 {
388 }
389
390 return data_type::NA;
391 }
392
396 template <std::floating_point T>
397 requires(sizeof(T) >= 2 && sizeof(T) <= 8)
398 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
399 {
400 switch (sizeof(T))
401 {
402 case 2:
404 case 4:
405 return data_type::FLOAT;
406 case 8:
407 return data_type::DOUBLE;
408 }
409
411 }
412
416 template <std::integral T>
417 requires(sizeof(T) >= 1 && sizeof(T) <= 8)
418 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
419 {
420 if constexpr (std::same_as<bool, T>)
421 {
422 return data_type::BOOL;
423 }
424 else if constexpr (std::signed_integral<T>)
425 {
426 switch (sizeof(T))
427 {
428 case 1:
429 return data_type::INT8;
430 case 2:
431 return data_type::INT16;
432 case 4:
433 return data_type::INT32;
434 case 8:
435 return data_type::INT64;
436 }
437 }
438 else
439 {
440 static_assert(std::unsigned_integral<T>);
441 switch (sizeof(T))
442 {
443 case 1:
444 return data_type::UINT8;
445 case 2:
446 return data_type::UINT16;
447 case 4:
448 return data_type::UINT32;
449 case 8:
450 return data_type::UINT64;
451 }
452 }
453
455 }
456
457 namespace detail
458 {
472 template <class A>
474 }
475
476 // REMARK: this function is not applicable to the following types
477 // - all decimal types because further information is needed (precision, scale)
478 // - fixed-sized binary because further information is needed (element size)
482 [[nodiscard]] constexpr std::string_view data_type_to_format(data_type type)
483 {
484 switch (type)
485 {
486 case data_type::NA:
487 return "n";
488 case data_type::BOOL:
489 return "b";
490 case data_type::UINT8:
491 return "C";
492 case data_type::INT8:
493 return "c";
495 return "S";
496 case data_type::INT16:
497 return "s";
499 return "I";
500 case data_type::INT32:
501 return "i";
503 return "L";
504 case data_type::INT64:
505 return "l";
507 return "e";
508 case data_type::FLOAT:
509 return "f";
511 return "g";
513 return "u";
515 return "U";
517 return "z";
519 return "Z";
521 return "vu";
523 return "vz";
525 return "tdD";
527 return "tdm";
529 return "tss:";
531 return "tsm:";
533 return "tsu:";
535 return "tsn:";
537 return "tDs";
539 return "tDm";
541 return "tDu";
543 return "tDn";
545 return "tiM";
547 return "tiD";
549 return "tin";
551 return "tts";
553 return "ttm";
555 return "ttu";
557 return "ttn";
558 case data_type::LIST:
559 return "+l";
561 return "+L";
563 return "+vl";
565 return "+vL";
567 return "+s";
568 case data_type::MAP:
569 return "+m";
571 return "+r";
572 default:
573 throw std::runtime_error("Unsupported data type");
574 }
575 }
576
578 [[nodiscard]] constexpr bool data_type_is_primitive(data_type dt) noexcept
579 {
580 switch (dt)
581 {
582 case data_type::BOOL:
583 case data_type::UINT8:
584 case data_type::INT8:
586 case data_type::INT16:
588 case data_type::INT32:
590 case data_type::INT64:
592 case data_type::FLOAT:
594 return true;
595 default:
596 return false;
597 }
598 }
599
601 [[nodiscard]] constexpr bool data_type_is_integer(data_type dt) noexcept
602 {
603 switch (dt)
604 {
605 case data_type::UINT8:
606 case data_type::INT8:
608 case data_type::INT16:
610 case data_type::INT32:
612 case data_type::INT64:
613 return true;
614 default:
615 return false;
616 }
617 }
618
619 class list_value;
620 class struct_value;
621 class map_value;
622
624 // NOTE: this needs to be in sync-order with `data_type`
626 null_type,
627 bool,
628 std::uint8_t,
629 std::int8_t,
630 std::uint16_t,
631 std::int16_t,
632 std::uint32_t,
633 std::int32_t,
634 std::uint64_t,
635 std::int64_t,
636 float16_t,
637 float32_t,
638 float64_t,
639 std::string,
640 std::vector<byte_t>,
641 date_days,
651 std::chrono::seconds,
652 std::chrono::milliseconds,
653 std::chrono::microseconds,
654 std::chrono::nanoseconds,
664 map_value,
669
671 template <class T>
673
675 // template <class T>
676 // concept is_arrow_base_type_or_compound = is_arrow_base_type<T> || is_list_value_v<T>;
678
687 template <class T>
689
/// Template alias to get the corresponding Arrow type for a given type:
/// `std::string_view` maps to `std::string`; any other `T` maps to itself.
template <class T>
using get_corresponding_arrow_type_t =
    typename std::conditional<std::is_same_v<T, std::string_view>, std::string, T>::type;
700
714 template <class T>
716
717 namespace detail
718 {
719 template <template <class> class>
721 {
722 };
723 }
728 template <class T>
731 typename T::value_type;
732
734 // typename detail::accepts_template<T::template default_layout>;
735
736 // TODO: add more interface requirements on the traits here
737 // TODO: add conversion operations between bytes and the value type
738 };
739
740
743 template <class T>
744 concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
746
749 template <class T>
751
752
754 template <has_arrow_type_traits T>
756
757 // For now, a tiny wrapper around data_type
758 // TODO: More data and functions to come
760 {
761 public:
762
763 constexpr data_descriptor() noexcept
765 {
766 }
767
768 data_descriptor(std::string_view format) noexcept
770 {
771 }
772
773 constexpr explicit data_descriptor(data_type id) noexcept
774 : m_id(id)
775 {
776 }
777
778 [[nodiscard]] constexpr data_type id() const noexcept
779 {
780 return m_id;
781 }
782
783 private:
784
785 data_type m_id;
786 };
787
788 namespace impl
789 {
790 template <class C, bool is_const>
792 : std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
793 {
794 };
795
796 template <class C, bool is_const>
798 } // namespace impl
799
/// Concept satisfied only when T is exactly std::int32_t or std::int64_t.
/// NOTE(review): presumably these are the integer types accepted as offsets by
/// the layouts — confirm against the users of this concept.
800 template <class T>
801 concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
802}
803
804#if defined(__cpp_lib_format)
805
806namespace std
807{
808 template <>
809 struct formatter<sparrow::data_type>
810 {
811 constexpr auto parse(std::format_parse_context& ctx)
812 {
813 return ctx.begin(); // Simple implementation
814 }
815
816 auto format(const sparrow::data_type& data_type, std::format_context& ctx) const
817 {
818 static const auto get_enum_name = [](sparrow::data_type dt) -> std::string_view
819 {
820 using enum sparrow::data_type;
821 switch (dt)
822 {
823 case NA:
824 return "N/A";
825 case BOOL:
826 return "bool";
827 case UINT8:
828 return "uint8";
829 case INT8:
830 return "int8";
831 case UINT16:
832 return "uint16";
833 case INT16:
834 return "int16";
835 case UINT32:
836 return "uint32";
837 case INT32:
838 return "int32";
839 case UINT64:
840 return "uint64";
841 case INT64:
842 return "int64";
843 case HALF_FLOAT:
844 return "float16";
845 case FLOAT:
846 return "float32";
847 case DOUBLE:
848 return "double";
849 case STRING:
850 return "String";
851 case LARGE_STRING:
852 return "Large string";
853 case BINARY:
854 return "Binary";
855 case LARGE_BINARY:
856 return "Large binary";
857 case DATE_DAYS:
858 return "Date days";
860 return "Date milliseconds";
862 return "Timestamp seconds";
864 return "Timestamp milliseconds";
866 return "Timestamp microseconds";
868 return "Timestamp nanoseconds";
869 case DURATION_SECONDS:
870 return "Duration seconds";
872 return "Duration milliseconds";
874 return "Duration microseconds";
876 return "Duration nanoseconds";
877 case INTERVAL_MONTHS:
878 return "Interval months";
880 return "Interval days time";
882 return "Interval months days nanoseconds";
883 case TIME_SECONDS:
884 return "Time seconds";
886 return "Time milliseconds";
888 return "Time microseconds";
889 case TIME_NANOSECONDS:
890 return "Time nanoseconds";
891 case LIST:
892 return "List";
893 case LARGE_LIST:
894 return "Large list";
895 case LIST_VIEW:
896 return "List view";
897 case LARGE_LIST_VIEW:
898 return "Large list view";
899 case FIXED_SIZED_LIST:
900 return "Fixed sized list";
901 case STRUCT:
902 return "Struct";
903 case MAP:
904 return "Map";
905 case DENSE_UNION:
906 return "Dense union";
907 case SPARSE_UNION:
908 return "Sparse union";
909 case RUN_ENCODED:
910 return "Run encoded";
911 case DECIMAL32:
912 return "Decimal32";
913 case DECIMAL64:
914 return "Decimal64";
915 case DECIMAL128:
916 return "Decimal128";
917 case DECIMAL256:
918 return "Decimal256";
920 return "Fixed width binary";
921 case STRING_VIEW:
922 return "String view";
923 case BINARY_VIEW:
924 return "Binary view";
925 };
926 return "UNKNOWN";
927 };
928
929 return std::format_to(ctx.out(), "{}", get_enum_name(data_type));
930 }
931 };
932}
933
/// std::formatter specialization for sparrow::null_type.
///
/// No format specification is interpreted; every value is rendered as the
/// fixed text "null_type".
template <>
struct std::formatter<sparrow::null_type>
{
    /// Consumes nothing from the format specification.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    /// Writes the fixed text to the output iterator and returns the iterator
    /// past the written characters.
    auto format(const sparrow::null_type&, std::format_context& ctx) const
    {
        constexpr std::string_view text = "null_type";
        return std::format_to(ctx.out(), "{}", text);
    }
};
947
948namespace sparrow
949{
950 inline std::ostream& operator<<(std::ostream& os, const null_type&)
951 {
952 os << std::format("{}", "null");
953 return os;
954 }
955}
956
957#endif
Half-precision floating-point type.
constexpr data_type id() const noexcept
constexpr data_descriptor(data_type id) noexcept
constexpr data_descriptor() noexcept
data_descriptor(std::string_view format) noexcept
Matches any type which is one of the base C++ types supported or at least that provides an arrow_trai...
Matches types providing valid and complete arrow_traits specialization.
Checks if a type is an extended base type for Arrow.
Matches C++ representation types which are supported by default.
Matches valid and complete arrow_traits specializations for type T.
#define SPARROW_API
Definition config.hpp:38
Main header file for half-precision functionality.
std::chrono::duration< int32_t, std::ratio< 2629746 > > months
typename get_inner_reference< C, is_const >::type get_inner_reference_t
decltype(append(TypeList{}, Us{}...)) append_t
Type alias for appending types or typelists to a given typelist.
Definition mp_utils.hpp:198
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
void unreachable()
Invokes undefined behavior for optimization purposes.
Definition mp_utils.hpp:882
consteval bool contains()
Checks if a typelist contains a specific type.
Definition mp_utils.hpp:633
std::byte byte_t
constexpr std::string_view data_type_to_format(data_type type)
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool data_type_is_primitive(data_type dt) noexcept
half_float::half float16_t
Definition data_type.hpp:93
SPARROW_API std::size_t num_bytes_for_decimal(const char *format)
float float32_t
Definition data_type.hpp:94
mpl::append_t< all_base_types_t, char, std::string_view > all_base_types_extended_t
is arrow base type or arrow compound type (list<T>, struct<T> etc.)
date::zoned_time< Duration, TimeZonePtr > timestamp
double float64_t
Definition data_type.hpp:95
std::ostream & operator<<(std::ostream &os, const nullval_t &)
std::conditional_t< std::same_as< T, std::string_view >, std::string, T > get_corresponding_arrow_type_t
Template alias to get the corresponding Arrow type for a given type.
typename arrow_traits< T >::default_layout default_layout_t
Binary layout type to use by default for the given C++ representation T of an arrow value.
constexpr data_type data_type_from_size(T={}) noexcept
mpl::typelist< null_type, bool, std::uint8_t, std::int8_t, std::uint16_t, std::int16_t, std::uint32_t, std::int32_t, std::uint64_t, std::int64_t, float16_t, float32_t, float64_t, std::string, std::vector< byte_t >, date_days, date_milliseconds, timestamp< std::chrono::seconds >, timestamp< std::chrono::milliseconds >, timestamp< std::chrono::microseconds >, timestamp< std::chrono::nanoseconds >, zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds, std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds, chrono::months, days_time_interval, month_day_nanoseconds_interval, chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds, list_value, struct_value, map_value, decimal< std::int32_t >, decimal< std::int64_t >, decimal< int128_t >, decimal< int256_t > > all_base_types_t
C++ types value representation types matching Arrow types.
constexpr bool all_digits(const std::string_view s)
std::chrono::time_point< std::chrono::system_clock, std::chrono::milliseconds > date_milliseconds
constexpr bool data_type_is_integer(data_type dt) noexcept
constexpr data_type format_to_data_type(std::string_view format)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::chrono::time_point< std::chrono::system_clock, chrono::days > date_days
Extensions to the C++ standard library.
Provides compile-time information about Arrow data types.
A duration representing time elapsed since midnight, in microseconds.
A duration representing time elapsed since midnight, in milliseconds.
A duration representing time elapsed since midnight, in nanoseconds.
A duration representing time elapsed since midnight.
Metafunction for retrieving the data_type of a typed array.
A sequence of types used for metaprogramming operations.
Definition mp_utils.hpp:123
A zoned time value without timezone, in microseconds.
A zoned time value without timezone, in milliseconds.
A zoned time value without timezone, in nanoseconds.
A zoned time value without timezone, in seconds.