sparrow 2.3.1
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
data_type.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <chrono>
18#include <version>
19
25
26#if defined(SPARROW_USE_DATE_POLYFILL)
27
28# include <date/tz.h>
29
30# if defined(__cpp_lib_format)
31# include <format>
32
/// std::formatter specialization for date::zoned_time<T>, available when the
/// date polyfill is in use and the standard library provides <format>.
///
/// The format specification is not interpreted: the value is rendered through
/// the stream insertion operator of date::zoned_time and the resulting text is
/// copied to the output.
template <typename T>
struct std::formatter<date::zoned_time<T>>
{
    // Nothing is parsed; the start of the specification is returned unchanged.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    // Streams the value into a temporary buffer, then forwards the text.
    auto format(const date::zoned_time<T>& date, std::format_context& ctx) const
    {
        std::ostringstream buffer;
        buffer << date;
        return std::format_to(ctx.out(), "{}", buffer.str());
    }
};
49# endif
50
51#else
52namespace date = std::chrono;
53#endif
54
55#include <climits>
56#include <concepts>
57#include <cstdint>
58#include <cstring>
59#include <sstream>
60#include <string>
61
67
68
69#if __cplusplus > 202002L and defined(__STDCPP_FLOAT16_T__) and defined(__STDCPP_FLOAT32_T__) \
70 and defined(__STDCPP_FLOAT64_T__)
71# define SPARROW_STD_FIXED_FLOAT_SUPPORT
72#endif
73
74// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
75// https://en.cppreference.com/w/cpp/types/floating-point
76#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
77# include <stdfloat>
78#else
80
81namespace sparrow
82{
83 template <class T>
84 struct is_floating_point : std::is_floating_point<T>
85 {
86 };
87
88 template <>
89 struct is_floating_point<half_float::half> : std::true_type
90 {
91 };
92
93 template <class T>
94 struct is_scalar : std::is_scalar<T>
95 {
96 };
97
98 template <>
99 struct is_scalar<half_float::half> : std::true_type
100 {
101 };
102
103 template <class T>
104 struct is_signed : std::is_signed<T>
105 {
106 };
107
108 template <>
109 struct is_signed<half_float::half> : std::true_type
110 {
111 };
112
113 template <class T>
115
/// Convenience variable template exposing is_scalar<T>::value.
116 template <class T>
117 inline constexpr bool is_scalar_v = is_scalar<T>::value;
118
/// Convenience variable template exposing is_signed<T>::value.
119 template <class T>
120 inline constexpr bool is_signed_v = is_signed<T>::value;
121}
122#endif
123
124
125namespace sparrow
126{
127
128// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
129// https://en.cppreference.com/w/cpp/types/floating-point
130#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
131 using float16_t = std::float16_t;
132 using float32_t = std::float32_t;
133 using float64_t = std::float64_t;
134#else
136 using float32_t = float;
137 using float64_t = double;
138#endif
139
140 // P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
141 // See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
142 // For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
143 // TODO: use the following once libc++ has full support for P0355R7.
144 // using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
145 template <typename Duration, typename TimeZonePtr = const date::time_zone*>
146 using timestamp = date::zoned_time<Duration, TimeZonePtr>;
147
148 // We need to be sure the current target platform is set up to correctly support these types.
149 static_assert(sizeof(float16_t) == 2);
150 static_assert(sizeof(float32_t) == 4);
151 static_assert(sizeof(float64_t) == 8);
152#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
153 static_assert(std::floating_point<float16_t>);
154 static_assert(std::floating_point<float32_t>);
155 static_assert(std::floating_point<float64_t>);
156#else
160#endif
161 static_assert(CHAR_BIT == 8);
162
163 using byte_t = std::byte; // For now we will use this to represent raw data TODO: evaluate later if it's
164 // the right choice, switch to char if not
165
167 {
168 };
169
/// Equality comparison for null_type: the operands are ignored and the result
/// is always true, i.e. any two null_type values compare equal.
170 constexpr bool operator==(const null_type&, const null_type&) noexcept
171 {
172 return true;
173 }
174
232
/// Checks whether a string consists solely of ASCII decimal digits.
///
/// Characters are compared against the '0'..'9' range directly rather than
/// through std::isdigit: std::isdigit is not constexpr (so the function could
/// not be evaluated at compile time for any non-empty input) and its result
/// depends on the current C locale.
///
/// @param s The string to inspect.
/// @return true if `s` is non-empty and every character is between '0' and
///         '9' inclusive; false otherwise, including for an empty string.
[[nodiscard]] constexpr bool all_digits(const std::string_view s)
{
    // An empty string is, by contract, not a number.
    if (s.empty())
    {
        return false;
    }

    for (const char c : s)
    {
        if (c < '0' || c > '9')
        {
            return false;
        }
    }
    return true;
}
246
247 // get the size in bytes of the decimal value type from its format string
248 [[nodiscard]] SPARROW_API std::size_t num_bytes_for_decimal(const char* format);
249
252 // TODO: consider returning an optional instead
253 [[nodiscard]] constexpr data_type format_to_data_type(std::string_view format)
254 {
255 if (format.size() == 1)
256 {
257 switch (format[0])
258 {
259 case 'n':
260 return data_type::NA;
261 case 'b':
262 return data_type::BOOL;
263 case 'C':
264 return data_type::UINT8;
265 case 'c':
266 return data_type::INT8;
267 case 'S':
268 return data_type::UINT16;
269 case 's':
270 return data_type::INT16;
271 case 'I':
272 return data_type::UINT32;
273 case 'i':
274 return data_type::INT32;
275 case 'L':
276 return data_type::UINT64;
277 case 'l':
278 return data_type::INT64;
279 case 'e':
281 case 'f':
282 return data_type::FLOAT;
283 case 'g':
284 return data_type::DOUBLE;
285 case 'u':
286 return data_type::STRING;
287 case 'U':
289 case 'z':
290 return data_type::BINARY;
291 case 'Z':
293 default:
294 return data_type::NA;
295 }
296 }
297 else if (format == "vu") // string view
298 {
300 }
301 else if (format == "vz") // binary view
302 {
304 }
305 else if (format.starts_with("t"))
306 {
307 if (format == "tdD")
308 {
310 }
311 else if (format == "tdm")
312 {
314 }
315 else if (format.starts_with("tss:"))
316 {
318 }
319 else if (format.starts_with("tsm:"))
320 {
322 }
323 else if (format.starts_with("tsu:"))
324 {
326 }
327 else if (format.starts_with("tsn:"))
328 {
330 }
331 else if (format == "tDs")
332 {
334 }
335 else if (format == "tDm")
336 {
338 }
339 else if (format == "tDu")
340 {
342 }
343 else if (format == "tDn")
344 {
346 }
347 else if (format == "tiM")
348 {
350 }
351 else if (format == "tiD")
352 {
354 }
355 else if (format == "tin")
356 {
358 }
359 else if (format == "tts")
360 {
362 }
363 else if (format == "ttm")
364 {
366 }
367 else if (format == "ttu")
368 {
370 }
371 else if (format == "ttn")
372 {
374 }
375 }
376 else if (format == "+l")
377 {
378 return data_type::LIST;
379 }
380 else if (format == "+L")
381 {
383 }
384 else if (format == "+vl")
385 {
387 }
388 else if (format == "+vL")
389 {
391 }
392 else if (format.starts_with("+w:"))
393 {
395 }
396 else if (format == "+s")
397 {
398 return data_type::STRUCT;
399 }
400 else if (format == "+m")
401 {
402 return data_type::MAP;
403 }
404 else if (format.starts_with("+ud:"))
405 {
407 }
408 else if (format.starts_with("+us:"))
409 {
411 }
412 else if (format.starts_with("+r"))
413 {
415 }
416 else if (format.starts_with("d:"))
417 {
418 const auto num_bytes = num_bytes_for_decimal(format.data());
419 switch (num_bytes)
420 {
421 case 4:
423 case 8:
425 case 16:
427 case 32:
429 default:
430 throw std::runtime_error("Invalid format for decimal");
431 }
432 }
433 else if (format.starts_with("w:"))
434 {
436 }
437
438 return data_type::NA;
439 }
440
444 template <std::floating_point T>
445 requires(sizeof(T) >= 2 && sizeof(T) <= 8)
446 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
447 {
448 switch (sizeof(T))
449 {
450 case 2:
452 case 4:
453 return data_type::FLOAT;
454 case 8:
455 return data_type::DOUBLE;
456 }
457
459 }
460
461#if !defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
463 template <typename T>
464 requires std::same_as<T, float16_t>
465 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
466 {
468 }
469#endif
470
474 template <std::integral T>
475 requires(sizeof(T) >= 1 && sizeof(T) <= 8)
476 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
477 {
478 if constexpr (std::same_as<bool, T>)
479 {
480 return data_type::BOOL;
481 }
482 else if constexpr (std::signed_integral<T>)
483 {
484 switch (sizeof(T))
485 {
486 case 1:
487 return data_type::INT8;
488 case 2:
489 return data_type::INT16;
490 case 4:
491 return data_type::INT32;
492 case 8:
493 return data_type::INT64;
494 }
495 }
496 else
497 {
498 static_assert(std::unsigned_integral<T>);
499 switch (sizeof(T))
500 {
501 case 1:
502 return data_type::UINT8;
503 case 2:
504 return data_type::UINT16;
505 case 4:
506 return data_type::UINT32;
507 case 8:
508 return data_type::UINT64;
509 }
510 }
511
513 }
514
515 namespace detail
516 {
530 template <class A>
532 }
533
534 // REMARK: this function is not applicable to the following types
535 // - all decimal types because further information is needed (precision, scale)
536 // - fixed-sized binary because further information is needed (element size)
540 [[nodiscard]] constexpr std::string_view data_type_to_format(data_type type)
541 {
542 switch (type)
543 {
544 case data_type::NA:
545 return "n";
546 case data_type::BOOL:
547 return "b";
548 case data_type::UINT8:
549 return "C";
550 case data_type::INT8:
551 return "c";
553 return "S";
554 case data_type::INT16:
555 return "s";
557 return "I";
558 case data_type::INT32:
559 return "i";
561 return "L";
562 case data_type::INT64:
563 return "l";
565 return "e";
566 case data_type::FLOAT:
567 return "f";
569 return "g";
571 return "u";
573 return "U";
575 return "z";
577 return "Z";
579 return "vu";
581 return "vz";
583 return "tdD";
585 return "tdm";
587 return "tss:";
589 return "tsm:";
591 return "tsu:";
593 return "tsn:";
595 return "tDs";
597 return "tDm";
599 return "tDu";
601 return "tDn";
603 return "tiM";
605 return "tiD";
607 return "tin";
609 return "tts";
611 return "ttm";
613 return "ttu";
615 return "ttn";
616 case data_type::LIST:
617 return "+l";
619 return "+L";
621 return "+vl";
623 return "+vL";
625 return "+s";
626 case data_type::MAP:
627 return "+m";
629 return "+r";
630 default:
631 throw std::runtime_error("Unsupported data type");
632 }
633 }
634
636 [[nodiscard]] constexpr bool data_type_is_primitive(data_type dt) noexcept
637 {
638 switch (dt)
639 {
640 case data_type::BOOL:
641 case data_type::UINT8:
642 case data_type::INT8:
644 case data_type::INT16:
646 case data_type::INT32:
648 case data_type::INT64:
650 case data_type::FLOAT:
652 return true;
653 default:
654 return false;
655 }
656 }
657
659 [[nodiscard]] constexpr bool data_type_is_integer(data_type dt) noexcept
660 {
661 switch (dt)
662 {
663 case data_type::UINT8:
664 case data_type::INT8:
666 case data_type::INT16:
668 case data_type::INT32:
670 case data_type::INT64:
671 return true;
672 default:
673 return false;
674 }
675 }
676
677 class list_value;
678 class struct_value;
679 class map_value;
680
682 // NOTE: this needs to be in sync-order with `data_type`
684 null_type,
685 bool,
686 std::uint8_t,
687 std::int8_t,
688 std::uint16_t,
689 std::int16_t,
690 std::uint32_t,
691 std::int32_t,
692 std::uint64_t,
693 std::int64_t,
694 float16_t,
695 float32_t,
696 float64_t,
697 std::string,
698 std::vector<byte_t>,
699 date_days,
709 std::chrono::seconds,
710 std::chrono::milliseconds,
711 std::chrono::microseconds,
712 std::chrono::nanoseconds,
722 map_value,
727
729 template <class T>
731
733 // template <class T>
734 // concept is_arrow_base_type_or_compound = is_arrow_base_type<T> || is_list_value_v<T>;
736
745 template <class T>
747
/// Maps a C++ type to the type used for its Arrow value representation:
/// std::string_view is replaced by std::string, every other type maps to
/// itself.
template <class T>
using get_corresponding_arrow_type_t =
    typename std::conditional<std::is_same_v<T, std::string_view>, std::string, T>::type;
758
772 template <class T>
774
775 namespace detail
776 {
777 template <template <class> class>
779 {
780 };
781 }
786 template <class T>
789 typename T::value_type;
790
792 // typename detail::accepts_template<T::template default_layout>;
793
794 // TODO: add more interface requirements on the traits here
795 // TODO: add conversion operations between bytes and the value type
796 };
797
798
801 template <class T>
802 concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
804
807 template <class T>
809
810
812 template <has_arrow_type_traits T>
814
815 // For now, a tiny wrapper around data_type
816 // TODO: More data and functions to come
818 {
819 public:
820
821 constexpr data_descriptor() noexcept
823 {
824 }
825
826 data_descriptor(std::string_view format) noexcept
828 {
829 }
830
831 constexpr explicit data_descriptor(data_type id) noexcept
832 : m_id(id)
833 {
834 }
835
836 [[nodiscard]] constexpr data_type id() const noexcept
837 {
838 return m_id;
839 }
840
841 private:
842
843 data_type m_id;
844 };
845
846 namespace impl
847 {
848 template <class C, bool is_const>
850 : std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
851 {
852 };
853
854 template <class C, bool is_const>
856 } // namespace impl
857
/// Concept satisfied only when T is exactly std::int32_t or std::int64_t.
858 template <class T>
859 concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
860}
861
862#if defined(__cpp_lib_format)
863
864namespace std
865{
866 template <>
867 struct formatter<sparrow::data_type>
868 {
869 constexpr auto parse(std::format_parse_context& ctx)
870 {
871 return ctx.begin(); // Simple implementation
872 }
873
874 auto format(const sparrow::data_type& data_type, std::format_context& ctx) const
875 {
876 static const auto get_enum_name = [](sparrow::data_type dt) -> std::string_view
877 {
878 using enum sparrow::data_type;
879 switch (dt)
880 {
881 case NA:
882 return "N/A";
883 case BOOL:
884 return "bool";
885 case UINT8:
886 return "uint8";
887 case INT8:
888 return "int8";
889 case UINT16:
890 return "uint16";
891 case INT16:
892 return "int16";
893 case UINT32:
894 return "uint32";
895 case INT32:
896 return "int32";
897 case UINT64:
898 return "uint64";
899 case INT64:
900 return "int64";
901 case HALF_FLOAT:
902 return "float16";
903 case FLOAT:
904 return "float32";
905 case DOUBLE:
906 return "double";
907 case STRING:
908 return "String";
909 case LARGE_STRING:
910 return "Large string";
911 case BINARY:
912 return "Binary";
913 case LARGE_BINARY:
914 return "Large binary";
915 case DATE_DAYS:
916 return "Date days";
918 return "Date milliseconds";
920 return "Timestamp seconds";
922 return "Timestamp milliseconds";
924 return "Timestamp microseconds";
926 return "Timestamp nanoseconds";
927 case DURATION_SECONDS:
928 return "Duration seconds";
930 return "Duration milliseconds";
932 return "Duration microseconds";
934 return "Duration nanoseconds";
935 case INTERVAL_MONTHS:
936 return "Interval months";
938 return "Interval days time";
940 return "Interval months days nanoseconds";
941 case TIME_SECONDS:
942 return "Time seconds";
944 return "Time milliseconds";
946 return "Time microseconds";
947 case TIME_NANOSECONDS:
948 return "Time nanoseconds";
949 case LIST:
950 return "List";
951 case LARGE_LIST:
952 return "Large list";
953 case LIST_VIEW:
954 return "List view";
955 case LARGE_LIST_VIEW:
956 return "Large list view";
957 case FIXED_SIZED_LIST:
958 return "Fixed sized list";
959 case STRUCT:
960 return "Struct";
961 case MAP:
962 return "Map";
963 case DENSE_UNION:
964 return "Dense union";
965 case SPARSE_UNION:
966 return "Sparse union";
967 case RUN_ENCODED:
968 return "Run encoded";
969 case DECIMAL32:
970 return "Decimal32";
971 case DECIMAL64:
972 return "Decimal64";
973 case DECIMAL128:
974 return "Decimal128";
975 case DECIMAL256:
976 return "Decimal256";
978 return "Fixed width binary";
979 case STRING_VIEW:
980 return "String view";
981 case BINARY_VIEW:
982 return "Binary view";
983 };
984 return "UNKNOWN";
985 };
986
987 return std::format_to(ctx.out(), "{}", get_enum_name(data_type));
988 }
989 };
990}
991
/// std::formatter specialization for sparrow::null_type.
///
/// The format specification is not interpreted, and every value is rendered
/// as the fixed text "null_type".
template <>
struct std::formatter<sparrow::null_type>
{
    // Nothing is parsed; the start of the specification is returned unchanged.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    // The argument carries no state, so only the fixed label is written.
    auto format(const sparrow::null_type&, std::format_context& ctx) const
    {
        constexpr std::string_view label = "null_type";
        return std::format_to(ctx.out(), "{}", label);
    }
};
1005
1006namespace sparrow
1007{
1008 inline std::ostream& operator<<(std::ostream& os, const null_type&)
1009 {
1010 os << std::format("{}", "null");
1011 return os;
1012 }
1013}
1014
1015#endif
Half-precision floating-point type.
constexpr data_type id() const noexcept
constexpr data_descriptor(data_type id) noexcept
constexpr data_descriptor() noexcept
data_descriptor(std::string_view format) noexcept
Matches any type which is one of the base C++ types supported or at least that provides an arrow_trai...
Matches types providing valid and complete arrow_traits specialization.
Checks if a type is an extended base type for Arrow.
Matches C++ representation types which are supported by default.
Matches valid and complete arrow_traits specializations for type T.
#define SPARROW_API
Definition config.hpp:38
Main header file for half-precision functionality.
Main namespace for half-precision functionality.
std::chrono::duration< int32_t, std::ratio< 2629746 > > months
typename get_inner_reference< C, is_const >::type get_inner_reference_t
decltype(append(TypeList{}, Us{}...)) append_t
Type alias for appending types or typelists to a given typelist.
Definition mp_utils.hpp:198
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
void unreachable()
Invokes undefined behavior for optimization purposes.
Definition mp_utils.hpp:882
consteval bool contains()
Checks if a typelist contains a specific type.
Definition mp_utils.hpp:633
std::byte byte_t
constexpr std::string_view data_type_to_format(data_type type)
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool data_type_is_primitive(data_type dt) noexcept
half_float::half float16_t
SPARROW_API std::size_t num_bytes_for_decimal(const char *format)
constexpr bool is_signed_v
float float32_t
constexpr bool is_scalar_v
mpl::append_t< all_base_types_t, char, std::string_view > all_base_types_extended_t
is arrow base type or arrow compound type (list<T>, struct<T> etc.)
date::zoned_time< Duration, TimeZonePtr > timestamp
double float64_t
std::ostream & operator<<(std::ostream &os, const nullval_t &)
std::conditional_t< std::same_as< T, std::string_view >, std::string, T > get_corresponding_arrow_type_t
Template alias to get the corresponding Arrow type for a given type.
typename arrow_traits< T >::default_layout default_layout_t
Binary layout type to use by default for the given C++ representation T of an arrow value.
constexpr data_type data_type_from_size(T={}) noexcept
mpl::typelist< null_type, bool, std::uint8_t, std::int8_t, std::uint16_t, std::int16_t, std::uint32_t, std::int32_t, std::uint64_t, std::int64_t, float16_t, float32_t, float64_t, std::string, std::vector< byte_t >, date_days, date_milliseconds, timestamp< std::chrono::seconds >, timestamp< std::chrono::milliseconds >, timestamp< std::chrono::microseconds >, timestamp< std::chrono::nanoseconds >, zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds, std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds, chrono::months, days_time_interval, month_day_nanoseconds_interval, chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds, list_value, struct_value, map_value, decimal< std::int32_t >, decimal< std::int64_t >, decimal< int128_t >, decimal< int256_t > > all_base_types_t
C++ types value representation types matching Arrow types.
constexpr bool all_digits(const std::string_view s)
std::chrono::time_point< std::chrono::system_clock, std::chrono::milliseconds > date_milliseconds
constexpr bool data_type_is_integer(data_type dt) noexcept
constexpr data_type format_to_data_type(std::string_view format)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::chrono::time_point< std::chrono::system_clock, chrono::days > date_days
constexpr bool is_floating_point_v
Extensions to the C++ standard library.
Provides compile-time information about Arrow data types.
A duration representing time elapsed since midnight, in microseconds.
A duration representing time elapsed since midnight, in milliseconds.
A duration representing time elapsed since midnight, in nanoseconds.
A duration representing time elapsed since midnight.
Metafunction for retrieving the data_type of a typed array.
A sequence of types used for metaprogramming operations.
Definition mp_utils.hpp:123
A zoned time value without timezone, in microseconds.
A zoned time value without timezone, in milliseconds.
A zoned time value without timezone, in nanoseconds.
A zoned time value without timezone, in seconds.