sparrow 0.9.0
Loading...
Searching...
No Matches
data_type.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <chrono>
18#include <version>
19
24
25#if defined(SPARROW_USE_DATE_POLYFILL)
26
27# include <date/tz.h>
28
29# if defined(__cpp_lib_format)
30# include <format>
31
// std::formatter specialization for date::zoned_time, compiled only when the date
// polyfill is selected and <format> is available. The format specification is not
// interpreted; the value is rendered through its stream insertion operator.
template <typename T>
struct std::formatter<date::zoned_time<T>>
{
    // Returns the beginning of the parse context without consuming anything.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    // Streams the zoned time into a temporary buffer, then writes the buffered text
    // to the format context's output iterator.
    auto format(const date::zoned_time<T>& date, std::format_context& ctx) const
    {
        std::ostringstream buffer;
        buffer << date;
        return std::format_to(ctx.out(), "{}", buffer.str());
    }
};
48# endif
49
50#else
51namespace date = std::chrono;
52#endif
53
54#include <climits>
55#include <concepts>
56#include <cstdint>
57#include <cstring>
58#include <sstream>
59#include <string>
60
66
67
68#if __cplusplus > 202002L and defined(__STDCPP_FLOAT16_T__) and defined(__STDCPP_FLOAT32_T__) \
69 and defined(__STDCPP_FLOAT64_T__)
70# define SPARROW_STD_FIXED_FLOAT_SUPPORT
71#endif
72
73// TODO: use `std::float16_t` etc. exclusively once we switch to C++23, see
74// https://en.cppreference.com/w/cpp/types/floating-point
75#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
76# include <stdfloat>
77#else
78// We disable some warnings for the 3rd party float16_t library
79# if defined(__clang__)
80# pragma clang diagnostic push
81# pragma clang diagnostic ignored "-Wconversion"
82# pragma clang diagnostic ignored "-Wsign-conversion"
83# pragma clang diagnostic ignored "-Wold-style-cast"
84# pragma clang diagnostic ignored "-Wdeprecated-declarations"
85# elif defined(__GNUC__)
86# pragma GCC diagnostic push
87# pragma GCC diagnostic ignored "-Wconversion"
88# pragma GCC diagnostic ignored "-Wsign-conversion"
89# pragma GCC diagnostic ignored "-Wold-style-cast"
90# elif defined(_MSC_VER)
91# pragma warning(push)
92# pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned
93 // mismatch
94# pragma warning(disable : 4514) // 'function' : unreferenced inline function has been removed
95# pragma warning(disable : 4668) // 'symbol' is not defined as a preprocessor macro, replacing with
96 // '0' for 'directives'
97# endif
98# include "sparrow/details/3rdparty/float16_t.hpp"
99# if defined(__GNUC__)
100# pragma GCC diagnostic pop
101# elif defined(__clang__)
102# pragma clang diagnostic pop
103# elif defined(_MSC_VER)
104# pragma warning(pop)
105# endif
106#endif
107
108
109namespace sparrow
110{
111
112// TODO: use `std::float16_t` etc. exclusively once we switch to C++23, see
113// https://en.cppreference.com/w/cpp/types/floating-point
114#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
115 using float16_t = std::float16_t;
116 using float32_t = std::float32_t;
117 using float64_t = std::float64_t;
118#else
119 using float16_t = numeric::float16_t;
120 using float32_t = float;
121 using float64_t = double;
122#endif
123
124 // P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
125 // See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
126 // For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
127 // TODO: use the following once libc++ has full support for P0355R7.
128 // using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
129 template <typename Duration, typename TimeZonePtr = const date::time_zone*>
130 using timestamp = date::zoned_time<Duration, TimeZonePtr>;
131
132 // We need to be sure the current target platform is set up to correctly support these types.
133 static_assert(sizeof(float16_t) == 2);
134 static_assert(sizeof(float32_t) == 4);
135 static_assert(sizeof(float64_t) == 8);
136 static_assert(std::is_floating_point_v<float16_t>);
137 static_assert(std::is_floating_point_v<float32_t>);
138 static_assert(std::is_floating_point_v<float64_t>);
139 static_assert(CHAR_BIT == 8);
140
141 using byte_t = std::byte; // For now we will use this to represent raw data TODO: evaluate later if it's
142 // the right choice, switch to char if not
143
145 {
146 };
147
148 inline bool operator==(const null_type&, const null_type&)
149 {
150 return true;
151 }
152
154 // TODO: does not support all types specified by the Arrow specification
155 // yet
212
// Checks whether a string consists solely of decimal digits.
// Returns true when `s` is non-empty and std::isdigit accepts every character;
// returns false otherwise, including for an empty view.
inline bool all_digits(const std::string_view s)
{
    if (s.empty())
    {
        return false;
    }

    // Each character is converted to unsigned char before being handed to std::isdigit.
    for (const unsigned char ch : s)
    {
        if (!std::isdigit(ch))
        {
            return false;
        }
    }
    return true;
}
226
227 // get the bit width for decimal value type from format
228 SPARROW_API std::size_t num_bytes_for_decimal(const char* format);
229
232 // TODO: consider returning an optional instead
233 inline data_type format_to_data_type(std::string_view format)
234 {
235 // TODO: add missing conversions from
236 // https://arrow.apache.org/docs/dev/format/CDataInterface.html#data-type-description-format-strings
237 if (format.size() == 1)
238 {
239 switch (format[0])
240 {
241 case 'n':
242 return data_type::NA;
243 case 'b':
244 return data_type::BOOL;
245 case 'C':
246 return data_type::UINT8;
247 case 'c':
248 return data_type::INT8;
249 case 'S':
250 return data_type::UINT16;
251 case 's':
252 return data_type::INT16;
253 case 'I':
254 return data_type::UINT32;
255 case 'i':
256 return data_type::INT32;
257 case 'L':
258 return data_type::UINT64;
259 case 'l':
260 return data_type::INT64;
261 case 'e':
263 case 'f':
264 return data_type::FLOAT;
265 case 'g':
266 return data_type::DOUBLE;
267 case 'u':
268 return data_type::STRING;
269 case 'U':
271 case 'z':
272 return data_type::BINARY;
273 case 'Z':
275 default:
276 return data_type::NA;
277 }
278 }
279 else if (format == "vu") // string view
280 {
282 }
283 else if (format == "vz") // binary view
284 {
286 }
287 // TODO: add proper timestamp support below
288 else if (format.starts_with("t"))
289 {
290 if (format == "tdD")
291 {
293 }
294 else if (format == "tdm")
295 {
297 }
298 else if (format.starts_with("tss:"))
299 {
301 }
302 else if (format.starts_with("tsm:"))
303 {
305 }
306 else if (format.starts_with("tsu:"))
307 {
309 }
310 else if (format.starts_with("tsn:"))
311 {
313 }
314 else if (format == "tDs")
315 {
317 }
318 else if (format == "tDm")
319 {
321 }
322 else if (format == "tDu")
323 {
325 }
326 else if (format == "tDn")
327 {
329 }
330 else if (format == "tiM")
331 {
333 }
334 else if (format == "tiD")
335 {
337 }
338 else if (format == "tin")
339 {
341 }
342 else if (format == "tts")
343 {
345 }
346 else if (format == "ttm")
347 {
349 }
350 else if (format == "ttu")
351 {
353 }
354 else if (format == "ttn")
355 {
357 }
358 }
359 else if (format == "+l")
360 {
361 return data_type::LIST;
362 }
363 else if (format == "+L")
364 {
366 }
367 else if (format == "+vl")
368 {
370 }
371 else if (format == "+vL")
372 {
374 }
375 else if (format.starts_with("+w:"))
376 {
378 }
379 else if (format == "+s")
380 {
381 return data_type::STRUCT;
382 }
383 else if (format == "+m")
384 {
385 return data_type::MAP;
386 }
387 else if (format.starts_with("+ud:"))
388 {
390 }
391 else if (format.starts_with("+us:"))
392 {
394 }
395 else if (format.starts_with("+r"))
396 {
398 }
399 else if (format.starts_with("d:"))
400 {
401 const auto num_bytes = num_bytes_for_decimal(format.data());
402 switch (num_bytes)
403 {
404 case 4:
406 case 8:
408 case 16:
410 case 32:
412 default:
413 throw std::runtime_error("Invalid format for decimal");
414 }
415 }
416 else if (format.starts_with("w:"))
417 {
419 }
420
421 return data_type::NA;
422 }
423
427 template <std::floating_point T>
428 requires(sizeof(T) >= 2 && sizeof(T) <= 8)
430 {
431 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
432 switch (sizeof(T))
433 {
434 case 2:
436 case 4:
437 return data_type::FLOAT;
438 case 8:
439 return data_type::DOUBLE;
440 }
441
443 }
444
448 template <std::integral T>
449 requires(sizeof(T) >= 1 && sizeof(T) <= 8)
451 {
452 if constexpr (std::same_as<bool, T>)
453 {
454 return data_type::BOOL;
455 }
456 else if constexpr (std::signed_integral<T>)
457 {
458 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
459 switch (sizeof(T))
460 {
461 case 1:
462 return data_type::INT8;
463 case 2:
464 return data_type::INT16;
465 case 4:
466 return data_type::INT32;
467 case 8:
468 return data_type::INT64;
469 }
470 }
471 else
472 {
473 static_assert(std::unsigned_integral<T>);
474
475 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
476 switch (sizeof(T))
477 {
478 case 1:
479 return data_type::UINT8;
480 case 2:
481 return data_type::UINT16;
482 case 4:
483 return data_type::UINT32;
484 case 8:
485 return data_type::UINT64;
486 }
487 }
488
490 }
491
492 // REMARK: this function is not applicable to the following types
493 // - all decimal types because further information is needed (precision, scale)
494 // - fixed-sized binary because further information is needed (element size)
498 constexpr std::string_view data_type_to_format(data_type type)
499 {
500 switch (type)
501 {
502 case data_type::NA:
503 return "n";
504 case data_type::BOOL:
505 return "b";
506 case data_type::UINT8:
507 return "C";
508 case data_type::INT8:
509 return "c";
511 return "S";
512 case data_type::INT16:
513 return "s";
515 return "I";
516 case data_type::INT32:
517 return "i";
519 return "L";
520 case data_type::INT64:
521 return "l";
523 return "e";
524 case data_type::FLOAT:
525 return "f";
527 return "g";
529 return "u";
531 return "U";
533 return "z";
535 return "Z";
537 return "tdD";
539 return "tdm";
541 return "tss:";
543 return "tsm:";
545 return "tsu:";
547 return "tsn:";
549 return "tDs";
551 return "tDm";
553 return "tDu";
555 return "tDn";
557 return "tiM";
559 return "tiD";
561 return "tin";
563 return "tts";
565 return "ttm";
567 return "ttu";
569 return "ttn";
570 case data_type::LIST:
571 return "+l";
573 return "+L";
574 default:
575 // TODO: add missing types
576 throw std::runtime_error("Unsupported data type");
577 }
578 }
579
582 {
583 switch (dt)
584 {
585 case data_type::BOOL:
586 case data_type::UINT8:
587 case data_type::INT8:
589 case data_type::INT16:
591 case data_type::INT32:
593 case data_type::INT64:
595 case data_type::FLOAT:
597 return true;
598 default:
599 return false;
600 }
601 }
602
605 {
606 switch (dt)
607 {
608 case data_type::UINT8:
609 case data_type::INT8:
611 case data_type::INT16:
613 case data_type::INT32:
615 case data_type::INT64:
616 return true;
617 default:
618 return false;
619 }
620 }
621
622 class list_value;
623 class struct_value;
624
626 // NOTE: this needs to be in sync-order with `data_type`
628 null_type,
629 bool,
630 std::uint8_t,
631 std::int8_t,
632 std::uint16_t,
633 std::int16_t,
634 std::uint32_t,
635 std::int32_t,
636 std::uint64_t,
637 std::int64_t,
638 float16_t,
639 float32_t,
640 float64_t,
641 std::string,
642 std::vector<byte_t>,
643 date_days,
653 std::chrono::seconds,
654 std::chrono::milliseconds,
655 std::chrono::microseconds,
656 std::chrono::nanoseconds,
664 // TODO: add missing fundamental types here
671
675
677 template <class T>
679
681 // template <class T>
682 // concept is_arrow_base_type_or_compound = is_arrow_base_type<T> || is_list_value_v<T>;
684
688
697 template <class T>
699
708 template <class T>
709 using get_corresponding_arrow_type_t = std::conditional_t<std::same_as<T, std::string_view>, std::string, T>;
710
724 template <class T>
726
727 namespace detail
728 {
729 template <template <class> class>
731 {
732 };
733 }
738 template <class T>
742 requires std::same_as<std::remove_cvref_t<decltype(T::type_id)>, ::sparrow::data_type>;
743
745 typename T::value_type;
746
748 // typename detail::accepts_template<T::template default_layout>;
749
750 // TODO: add more interface requirements on the traits here
751 // TODO: add conversion operations between bytes and the value type
752 };
753
754
757 template <class T>
758 concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
760
763 template <class T>
765
768 template <has_arrow_type_traits T>
769 constexpr auto arrow_type_id() -> data_type
770 {
772 }
773
776 template <has_arrow_type_traits T>
777 constexpr auto arrow_type_id(const T&) -> data_type
778 {
779 return arrow_type_id<T>();
780 }
781
784 template <has_arrow_type_traits T>
785 constexpr std::string_view data_type_format_of()
786 {
788 }
789
791 template <has_arrow_type_traits T>
793
794 // For now, a tiny wrapper around data_type
795 // TODO: More data and functions to come
797 {
798 public:
799
800 constexpr data_descriptor()
802 {
803 }
804
805 data_descriptor(std::string_view format)
807 {
808 }
809
810 constexpr explicit data_descriptor(data_type id)
811 : m_id(id)
812 {
813 }
814
815 constexpr data_type id() const
816 {
817 return m_id;
818 }
819
820 private:
821
822 data_type m_id;
823 };
824
825 namespace impl
826 {
827 template <class C, bool is_const>
829 : std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
830 {
831 };
832
833 template <class C, bool is_const>
835 } // namespace impl
836
837 template <class T>
838 concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
839}
840
841#if defined(__cpp_lib_format)
842
843namespace std
844{
845 template <>
846 struct formatter<sparrow::data_type>
847 {
848 constexpr auto parse(std::format_parse_context& ctx)
849 {
850 return ctx.begin(); // Simple implementation
851 }
852
853 auto format(const sparrow::data_type& data_type, std::format_context& ctx) const
854 {
855 static const auto get_enum_name = [](sparrow::data_type dt) -> std::string_view
856 {
857 using enum sparrow::data_type;
858 switch (dt)
859 {
860 case NA:
861 return "N/A";
862 case BOOL:
863 return "bool";
864 case UINT8:
865 return "uint8";
866 case INT8:
867 return "int8";
868 case UINT16:
869 return "uint16";
870 case INT16:
871 return "int16";
872 case UINT32:
873 return "uint32";
874 case INT32:
875 return "int32";
876 case UINT64:
877 return "uint64";
878 case INT64:
879 return "int64";
880 case HALF_FLOAT:
881 return "float16";
882 case FLOAT:
883 return "float32";
884 case DOUBLE:
885 return "double";
886 case STRING:
887 return "String";
888 case LARGE_STRING:
889 return "Large string";
890 case BINARY:
891 return "Binary";
892 case LARGE_BINARY:
893 return "Large binary";
894 case DATE_DAYS:
895 return "Date days";
897 return "Date milliseconds";
899 return "Timestamp seconds";
901 return "Timestamp milliseconds";
903 return "Timestamp microseconds";
905 return "Timestamp nanoseconds";
906 case DURATION_SECONDS:
907 return "Duration seconds";
909 return "Duration milliseconds";
911 return "Duration microseconds";
913 return "Duration nanoseconds";
914 case INTERVAL_MONTHS:
915 return "Interval months";
917 return "Interval days time";
919 return "Interval months days nanoseconds";
920 case TIME_SECONDS:
921 return "Time seconds";
923 return "Time milliseconds";
925 return "Time microseconds";
926 case TIME_NANOSECONDS:
927 return "Time nanoseconds";
928 case LIST:
929 return "List";
930 case LARGE_LIST:
931 return "Large list";
932 case LIST_VIEW:
933 return "List view";
934 case LARGE_LIST_VIEW:
935 return "Large list view";
936 case FIXED_SIZED_LIST:
937 return "Fixed sized list";
938 case STRUCT:
939 return "Struct";
940 case MAP:
941 return "Map";
942 case DENSE_UNION:
943 return "Dense union";
944 case SPARSE_UNION:
945 return "Sparse union";
946 case RUN_ENCODED:
947 return "Run encoded";
948 case DECIMAL32:
949 return "Decimal32";
950 case DECIMAL64:
951 return "Decimal64";
952 case DECIMAL128:
953 return "Decimal128";
954 case DECIMAL256:
955 return "Decimal256";
957 return "Fixed width binary";
958 case STRING_VIEW:
959 return "String view";
960 case BINARY_VIEW:
961 return "Binary view";
962 };
963 return "UNKNOWN";
964 };
965
966 return std::format_to(ctx.out(), "{}", get_enum_name(data_type));
967 }
968 };
969}
970
// std::formatter specialization for sparrow::null_type.
// The format specification is not interpreted and the output is always the
// literal text "null_type".
template <>
struct std::formatter<sparrow::null_type>
{
    // Returns the beginning of the parse context without consuming anything.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    // Writes the fixed text for a null_type value; the value itself is not inspected.
    auto format(const sparrow::null_type&, std::format_context& ctx) const
    {
        return std::format_to(ctx.out(), "{}", "null_type");
    }
};
984
985inline std::ostream& operator<<(std::ostream& os, const sparrow::null_type&)
986{
987 os << std::format("{}", "null");
988 return os;
989}
990
// std::formatter specialization for std::byte.
// The format specification is not interpreted; the byte is printed as the
// decimal representation of its value converted to int.
template <>
struct std::formatter<std::byte>
{
    // Returns the beginning of the parse context without consuming anything.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    // Converts the byte to int and writes it with the default integer formatting.
    auto format(const std::byte& b, std::format_context& ctx) const
    {
        const int numeric_value = static_cast<int>(b);
        return std::format_to(ctx.out(), "{}", numeric_value);
    }
};
1004
1005#endif
constexpr data_descriptor(data_type id)
data_descriptor(std::string_view format)
constexpr data_type id() const
Matches any type which is one of the base C++ types supported or at least that provides an arrow_trai...
Matches types providing valid and complete arrow_traits specialization.
Checks if a type is an extended base type for Arrow.
Matches C++ representation types which are supported by default.
Matches valid and complete arrow_traits specializations for type T.
#define SPARROW_API
Definition config.hpp:38
std::chrono::duration< int32_t, std::ratio< 2629746 > > months
typename get_inner_reference< C, is_const >::type get_inner_reference_t
consteval bool contains(L list)
Definition mp_utils.hpp:285
decltype(append(TypeList{}, Us{}...)) append_t
Appends one or more types or typelist to a given TypeList.
Definition mp_utils.hpp:103
constexpr bool is_type_instance_of_v
true if T is a concrete type template instanciation of U which is a type template.
Definition mp_utils.hpp:50
void unreachable()
Invokes undefined behavior.
Definition mp_utils.hpp:425
std::byte byte_t
static constexpr all_base_types_extended_t all_base_types_extended
Type list of every C++ representation types supported by default, in order matching data_type related...
constexpr std::string_view data_type_format_of()
constexpr std::string_view data_type_to_format(data_type type)
mpl::typelist< null_type, bool, std::uint8_t, std::int8_t, std::uint16_t, std::int16_t, std::uint32_t, std::int32_t, std::uint64_t, std::int64_t, float16_t, float32_t, float64_t, std::string, std::vector< byte_t >, date_days, date_milliseconds, timestamp< std::chrono::seconds >, timestamp< std::chrono::milliseconds >, timestamp< std::chrono::microseconds >, timestamp< std::chrono::nanoseconds >, zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds, std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds, chrono::months, days_time_interval, month_day_nanoseconds_interval, chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds, list_value, struct_value, decimal< std::int32_t >, decimal< std::int64_t >, decimal< int128_t >, decimal< int256_t > > all_base_types_t
C++ types value representation types matching Arrow types.
constexpr bool data_type_is_primitive(data_type dt)
bool all_digits(const std::string_view s)
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
SPARROW_API std::size_t num_bytes_for_decimal(const char *format)
float float32_t
static constexpr all_base_types_t all_base_types
Type list of every C++ representation types supported by default, in order matching data_type related...
constexpr data_type data_type_from_size(T={})
data_type format_to_data_type(std::string_view format)
mpl::append_t< all_base_types_t, char, std::string_view > all_base_types_extended_t
is arrow base type or arrow compound type (list<T>, struct<T> etc.)
date::zoned_time< Duration, TimeZonePtr > timestamp
double float64_t
constexpr bool data_type_is_integer(data_type dt)
std::conditional_t< std::same_as< T, std::string_view >, std::string, T > get_corresponding_arrow_type_t
Template alias to get the corresponding Arrow type for a given type.
typename arrow_traits< T >::default_layout default_layout_t
Binary layout type to use by default for the given C++ representation T of an arrow value.
numeric::float16_t float16_t
std::chrono::time_point< std::chrono::system_clock, std::chrono::milliseconds > date_milliseconds
constexpr auto arrow_type_id() -> data_type
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::chrono::time_point< std::chrono::system_clock, chrono::days > date_days
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Definition nullable.hpp:933
Provides compile-time information about Arrow data types.
A duration representing time elapsed since midnight, in microseconds.
A duration representing time elapsed since midnight, in milliseconds.
A duration representing time elapsed since midnight, in nanoseconds.
A duration representing time elapsed since midnight.
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:55
A zoned time value without timezone, in microseconds.
A zoned time value without timezone, in milliseconds.
A zoned time value without timezone, in nanoseconds.
A zoned time value without timezone, in seconds.