sparrow 0.3.0
Loading...
Searching...
No Matches
data_type.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <chrono>
18#include <version>
19
23
24#if defined(SPARROW_USE_DATE_POLYFILL)
25
26# include <date/tz.h>
27
28# if defined(__cpp_lib_format)
29# include <format>
30
/// std::format support for `date::zoned_time<T>`.
///
/// The value is rendered by streaming it with `operator<<` into a string stream and
/// writing the resulting text to the output unchanged. No format specification is
/// interpreted.
template <typename T>
struct std::formatter<date::zoned_time<T>>
{
    /// Performs no parsing: hands back the start of the format-spec range untouched.
    constexpr std::format_parse_context::iterator parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();
    }

    /// Streams `date` into a temporary buffer and writes that text through `format_ctx`.
    /// @return the output iterator returned by `std::format_to`.
    std::format_context::iterator
    format(const date::zoned_time<T>& date, std::format_context& format_ctx) const
    {
        std::ostringstream buffer;
        buffer << date;
        return std::format_to(format_ctx.out(), "{}", buffer.str());
    }
};
47# endif
48
49#else
50namespace date = std::chrono;
51#endif
52
53#include <climits>
54#include <concepts>
55#include <cstdint>
56#include <cstring>
57#include <sstream>
58#include <string>
59
65
66
67#if __cplusplus > 202002L and defined(__STDCPP_FLOAT16_T__) and defined(__STDCPP_FLOAT32_T__) \
68 and defined(__STDCPP_FLOAT64_T__)
69# define SPARROW_STD_FIXED_FLOAT_SUPPORT
70#endif
71
72// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
73// https://en.cppreference.com/w/cpp/types/floating-point
74#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
75# include <stdfloat>
76#else
77// We disable some warnings for the 3rd party float16_t library
78# if defined(__clang__)
79# pragma clang diagnostic push
80# pragma clang diagnostic ignored "-Wconversion"
81# pragma clang diagnostic ignored "-Wsign-conversion"
82# pragma clang diagnostic ignored "-Wold-style-cast"
83# pragma clang diagnostic ignored "-Wdeprecated-declarations"
84# elif defined(__GNUC__)
85# pragma GCC diagnostic push
86# pragma GCC diagnostic ignored "-Wconversion"
87# pragma GCC diagnostic ignored "-Wsign-conversion"
88# pragma GCC diagnostic ignored "-Wold-style-cast"
89# elif defined(_MSC_VER)
90# pragma warning(push)
91# pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned
92 // mismatch
93# pragma warning(disable : 4514) // 'function' : unreferenced inline function has been removed
94# pragma warning(disable : 4668) // 'symbol' is not defined as a preprocessor macro, replacing with
95 // '0' for 'directives'
96# endif
97# include "sparrow/details/3rdparty/float16_t.hpp"
98# if defined(__GNUC__)
99# pragma GCC diagnostic pop
100# elif defined(__clang__)
101# pragma clang diagnostic pop
102# elif defined(_MSC_VER)
103# pragma warning(pop)
104# endif
105#endif
106
107
108namespace sparrow
109{
110
111// TODO: use exclusively `std::float16_t` etc. once we switch to C++23, see
112// https://en.cppreference.com/w/cpp/types/floating-point
113#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
114 using float16_t = std::float16_t;
115 using float32_t = std::float32_t;
116 using float64_t = std::float64_t;
117#else
118 using float16_t = numeric::float16_t;
119 using float32_t = float;
120 using float64_t = double;
121#endif
122
123 // P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
124 // See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
125 // For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
126 // TODO: use the following once libc++ has full support for P0355R7.
127 // using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
128 template <typename Duration, typename TimeZonePtr = const date::time_zone*>
129 using timestamp = date::zoned_time<Duration, TimeZonePtr>;
130
131 // We need to be sure the current target platform is setup to support correctly these types.
132 static_assert(sizeof(float16_t) == 2);
133 static_assert(sizeof(float32_t) == 4);
134 static_assert(sizeof(float64_t) == 8);
135 static_assert(std::is_floating_point_v<float16_t>);
136 static_assert(std::is_floating_point_v<float32_t>);
137 static_assert(std::is_floating_point_v<float64_t>);
138 static_assert(CHAR_BIT == 8);
139
140 using byte_t = std::byte; // For now we will use this to represent raw data TODO: evaluate later if it's
141 // the right choice, switch to char if not
142
144 {
145 };
146
147 inline bool operator==(const null_type&, const null_type&)
148 {
149 return true;
150 }
151
153 // TODO: does not support all types specified by the Arrow specification
154 // yet
211
// Helper function: tells whether a string consists solely of decimal digits.
//
// Returns false for an empty string. Only the ASCII characters '0'..'9' are
// accepted, checked with plain comparisons, so the result does not depend on
// the C locale and the function needs neither <algorithm> nor <cctype>.
constexpr bool all_digits(const std::string_view s) noexcept
{
    if (s.empty())
    {
        return false;
    }
    for (const char c : s)
    {
        if (c < '0' || c > '9')
        {
            return false;
        }
    }
    return true;
}
225
226 // get the size in bytes of the decimal value type from its format string
227 SPARROW_API std::size_t num_bytes_for_decimal(const char* format);
228
231 // TODO: consider returning an optional instead
232 inline data_type format_to_data_type(std::string_view format)
233 {
234 // TODO: add missing conversions from
235 // https://arrow.apache.org/docs/dev/format/CDataInterface.html#data-type-description-format-strings
236 if (format.size() == 1)
237 {
238 switch (format[0])
239 {
240 case 'n':
241 return data_type::NA;
242 case 'b':
243 return data_type::BOOL;
244 case 'C':
245 return data_type::UINT8;
246 case 'c':
247 return data_type::INT8;
248 case 'S':
249 return data_type::UINT16;
250 case 's':
251 return data_type::INT16;
252 case 'I':
253 return data_type::UINT32;
254 case 'i':
255 return data_type::INT32;
256 case 'L':
257 return data_type::UINT64;
258 case 'l':
259 return data_type::INT64;
260 case 'e':
262 case 'f':
263 return data_type::FLOAT;
264 case 'g':
265 return data_type::DOUBLE;
266 case 'u':
267 return data_type::STRING;
268 case 'U':
270 case 'z':
271 return data_type::BINARY;
272 case 'Z':
274 default:
275 return data_type::NA;
276 }
277 }
278 else if (format == "vu") // string view
279 {
281 }
282 else if (format == "vz") // binary view
283 {
285 }
286 // TODO: add proper timestamp support below
287 else if (format.starts_with("t"))
288 {
289 if (format == "tdD")
290 {
292 }
293 else if (format == "tdm")
294 {
296 }
297 else if (format.starts_with("tss:"))
298 {
300 }
301 else if (format.starts_with("tsm:"))
302 {
304 }
305 else if (format.starts_with("tsu:"))
306 {
308 }
309 else if (format.starts_with("tsn:"))
310 {
312 }
313 else if (format == "tDs")
314 {
316 }
317 else if (format == "tDm")
318 {
320 }
321 else if (format == "tDu")
322 {
324 }
325 else if (format == "tDn")
326 {
328 }
329 else if (format == "tiM")
330 {
332 }
333 else if (format == "tiD")
334 {
336 }
337 else if (format == "tin")
338 {
340 }
341 else if (format == "tts")
342 {
344 }
345 else if (format == "ttm")
346 {
348 }
349 else if (format == "ttu")
350 {
352 }
353 else if (format == "ttn")
354 {
356 }
357 }
358 else if (format == "+l")
359 {
360 return data_type::LIST;
361 }
362 else if (format == "+L")
363 {
365 }
366 else if (format == "+vl")
367 {
369 }
370 else if (format == "+vL")
371 {
373 }
374 else if (format.starts_with("+w:"))
375 {
377 }
378 else if (format == "+s")
379 {
380 return data_type::STRUCT;
381 }
382 else if (format == "+m")
383 {
384 return data_type::MAP;
385 }
386 else if (format.starts_with("+ud:"))
387 {
389 }
390 else if (format.starts_with("+us:"))
391 {
393 }
394 else if (format.starts_with("+r"))
395 {
397 }
398 else if (format.starts_with("d:"))
399 {
400 const auto num_bytes = num_bytes_for_decimal(format.data());
401 switch (num_bytes)
402 {
403 case 4:
405 case 8:
407 case 16:
409 case 32:
411 default:
412 throw std::runtime_error("Invalid format for decimal");
413 }
414 }
415 else if (format.starts_with("w:"))
416 {
418 }
419
420 return data_type::NA;
421 }
422
426 template <std::floating_point T>
427 requires(sizeof(T) >= 2 && sizeof(T) <= 8)
429 {
430 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
431 switch (sizeof(T))
432 {
433 case 2:
435 case 4:
436 return data_type::FLOAT;
437 case 8:
438 return data_type::DOUBLE;
439 }
440
442 }
443
447 template <std::integral T>
448 requires(sizeof(T) >= 1 && sizeof(T) <= 8)
450 {
451 if constexpr (std::same_as<bool, T>)
452 {
453 return data_type::BOOL;
454 }
455 else if constexpr (std::signed_integral<T>)
456 {
457 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
458 switch (sizeof(T))
459 {
460 case 1:
461 return data_type::INT8;
462 case 2:
463 return data_type::INT16;
464 case 4:
465 return data_type::INT32;
466 case 8:
467 return data_type::INT64;
468 }
469 }
470 else
471 {
472 static_assert(std::unsigned_integral<T>);
473
474 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
475 switch (sizeof(T))
476 {
477 case 1:
478 return data_type::UINT8;
479 case 2:
480 return data_type::UINT16;
481 case 4:
482 return data_type::UINT32;
483 case 8:
484 return data_type::UINT64;
485 }
486 }
487
489 }
490
491 // REMARK: this function is not applicable to the following types
492 // - all decimal types because further information is needed (precision, scale)
493 // - fixed-sized binary because further information is needed (element size)
497 constexpr std::string_view data_type_to_format(data_type type)
498 {
499 switch (type)
500 {
501 case data_type::NA:
502 return "n";
503 case data_type::BOOL:
504 return "b";
505 case data_type::UINT8:
506 return "C";
507 case data_type::INT8:
508 return "c";
510 return "S";
511 case data_type::INT16:
512 return "s";
514 return "I";
515 case data_type::INT32:
516 return "i";
518 return "L";
519 case data_type::INT64:
520 return "l";
522 return "e";
523 case data_type::FLOAT:
524 return "f";
526 return "g";
528 return "u";
530 return "U";
532 return "z";
534 return "Z";
536 return "tdD";
538 return "tdm";
540 return "tss:";
542 return "tsm:";
544 return "tsu:";
546 return "tsn:";
548 return "tDs";
550 return "tDm";
552 return "tDu";
554 return "tDn";
556 return "tiM";
558 return "tiD";
560 return "tin";
562 return "tts";
564 return "ttm";
566 return "ttu";
568 return "ttn";
569 case data_type::LIST:
570 return "+l";
572 return "+L";
573 default:
574 // TODO: add missing types
575 throw std::runtime_error("Unsupported data type");
576 }
577 }
578
581 {
582 switch (dt)
583 {
584 case data_type::BOOL:
585 case data_type::UINT8:
586 case data_type::INT8:
588 case data_type::INT16:
590 case data_type::INT32:
592 case data_type::INT64:
594 case data_type::FLOAT:
596 return true;
597 default:
598 return false;
599 }
600 }
601
604 {
605 switch (dt)
606 {
607 case data_type::UINT8:
608 case data_type::INT8:
610 case data_type::INT16:
612 case data_type::INT32:
614 case data_type::INT64:
615 return true;
616 default:
617 return false;
618 }
619 }
620
621 class list_value;
622 class struct_value;
623
625 // NOTE: this needs to be in sync-order with `data_type`
627 null_type,
628 bool,
629 std::uint8_t,
630 std::int8_t,
631 std::uint16_t,
632 std::int16_t,
633 std::uint32_t,
634 std::int32_t,
635 std::uint64_t,
636 std::int64_t,
637 float16_t,
638 float32_t,
639 float64_t,
640 std::string,
641 std::vector<byte_t>,
642 date_days,
648 std::chrono::seconds,
649 std::chrono::milliseconds,
650 std::chrono::microseconds,
651 std::chrono::nanoseconds,
659 // TODO: add missing fundamental types here
666
670
672 template <class T>
674
676 // template <class T>
677 // concept is_arrow_base_type_or_compound = is_arrow_base_type<T> || is_list_value_v<T>;
679
683
692 template <class T>
694
/// Template alias yielding the corresponding Arrow type for a given type:
/// `std::string_view` is replaced by `std::string`, every other type maps to itself.
template <class T>
using get_corresponding_arrow_type_t = std::conditional_t<
    std::is_same_v<T, std::string_view>,
    std::string,
    T>;
705
719 template <class T>
721
722 namespace detail
723 {
724 template <template <class> class>
726 {
727 };
728 }
733 template <class T>
737 requires std::same_as<std::remove_cvref_t<decltype(T::type_id)>, ::sparrow::data_type>;
738
740 typename T::value_type;
741
743 // typename detail::accepts_template<T::template default_layout>;
744
745 // TODO: add more interface requirements on the traits here
746 // TODO: add conversion operations between bytes and the value type
747 };
748
749
752 template <class T>
753 concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
755
758 template <class T>
760
763 template <has_arrow_type_traits T>
764 constexpr auto arrow_type_id() -> data_type
765 {
767 }
768
771 template <has_arrow_type_traits T>
772 constexpr auto arrow_type_id(const T&) -> data_type
773 {
774 return arrow_type_id<T>();
775 }
776
779 template <has_arrow_type_traits T>
780 constexpr std::string_view data_type_format_of()
781 {
783 }
784
786 template <has_arrow_type_traits T>
788
789 // For now, a tiny wrapper around data_type
790 // TODO: More data and functions to come
792 {
793 public:
794
795 constexpr data_descriptor()
797 {
798 }
799
800 data_descriptor(std::string_view format)
802 {
803 }
804
805 constexpr explicit data_descriptor(data_type id)
806 : m_id(id)
807 {
808 }
809
810 constexpr data_type id() const
811 {
812 return m_id;
813 }
814
815 private:
816
817 data_type m_id;
818 };
819
820 namespace impl
821 {
822 template <class C, bool is_const>
824 : std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
825 {
826 };
827
828 template <class C, bool is_const>
830 } // namespace impl
831
832 template <class T>
833 concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
834}
835
836#if defined(__cpp_lib_format)
837
838namespace std
839{
840 template <>
841 struct formatter<sparrow::data_type>
842 {
843 constexpr auto parse(std::format_parse_context& ctx)
844 {
845 return ctx.begin(); // Simple implementation
846 }
847
848 auto format(const sparrow::data_type& data_type, std::format_context& ctx) const
849 {
850 static const auto get_enum_name = [](sparrow::data_type dt) -> std::string_view
851 {
852 using enum sparrow::data_type;
853 switch (dt)
854 {
855 case NA:
856 return "N/A";
857 case BOOL:
858 return "bool";
859 case UINT8:
860 return "uint8";
861 case INT8:
862 return "int8";
863 case UINT16:
864 return "uint16";
865 case INT16:
866 return "int16";
867 case UINT32:
868 return "uint32";
869 case INT32:
870 return "int32";
871 case UINT64:
872 return "uint64";
873 case INT64:
874 return "int64";
875 case HALF_FLOAT:
876 return "float16";
877 case FLOAT:
878 return "float32";
879 case DOUBLE:
880 return "double";
881 case STRING:
882 return "String";
883 case LARGE_STRING:
884 return "Large string";
885 case BINARY:
886 return "Binary";
887 case LARGE_BINARY:
888 return "Large binary";
889 case DATE_DAYS:
890 return "Date days";
892 return "Date milliseconds";
894 return "Timestamp seconds";
896 return "Timestamp milliseconds";
898 return "Timestamp microseconds";
900 return "Timestamp nanoseconds";
901 case DURATION_SECONDS:
902 return "Duration seconds";
904 return "Duration milliseconds";
906 return "Duration microseconds";
908 return "Duration nanoseconds";
909 case INTERVAL_MONTHS:
910 return "Interval months";
912 return "Interval days time";
914 return "Interval months days nanoseconds";
915 case TIME_SECONDS:
916 return "Time seconds";
918 return "Time milliseconds";
920 return "Time microseconds";
921 case TIME_NANOSECONDS:
922 return "Time nanoseconds";
923 case LIST:
924 return "List";
925 case LARGE_LIST:
926 return "Large list";
927 case LIST_VIEW:
928 return "List view";
929 case LARGE_LIST_VIEW:
930 return "Large list view";
931 case FIXED_SIZED_LIST:
932 return "Fixed sized list";
933 case STRUCT:
934 return "Struct";
935 case MAP:
936 return "Map";
937 case DENSE_UNION:
938 return "Dense union";
939 case SPARSE_UNION:
940 return "Sparse union";
941 case RUN_ENCODED:
942 return "Run encoded";
943 case DECIMAL32:
944 return "Decimal32";
945 case DECIMAL64:
946 return "Decimal64";
947 case DECIMAL128:
948 return "Decimal128";
949 case DECIMAL256:
950 return "Decimal256";
952 return "Fixed width binary";
953 case STRING_VIEW:
954 return "String view";
955 case BINARY_VIEW:
956 return "Binary view";
957 };
958 return "UNKNOWN";
959 };
960
961 return std::format_to(ctx.out(), "{}", get_enum_name(data_type));
962 }
963 };
964
/// std::format support for `sparrow::null_type`: always prints the fixed text "null_type".
template <>
struct formatter<sparrow::null_type>
{
    /// Performs no parsing: hands back the start of the format-spec range untouched.
    constexpr std::format_parse_context::iterator parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();
    }

    /// Writes the literal text; the value argument is ignored.
    /// @return the output iterator returned by `std::format_to`.
    std::format_context::iterator
    format(const sparrow::null_type& /*value*/, std::format_context& format_ctx) const
    {
        return std::format_to(format_ctx.out(), "null_type");
    }
};
978
/// std::format support for `std::byte`: prints the byte's value as a decimal integer.
///
/// NOTE(review): `std::byte` is a standard library type, not a program-defined one;
/// check this specialization against the rules for adding specializations to
/// namespace `std`.
template <>
struct formatter<std::byte>
{
    /// Performs no parsing: hands back the start of the format-spec range untouched.
    constexpr std::format_parse_context::iterator parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();
    }

    /// Converts the byte to `int` and formats that integer with the default "{}" spec.
    /// @return the output iterator returned by `std::format_to`.
    std::format_context::iterator format(const std::byte& b, std::format_context& format_ctx) const
    {
        const int numeric_value = std::to_integer<int>(b);
        return std::format_to(format_ctx.out(), "{}", numeric_value);
    }
};
992}
993
994#endif
constexpr data_descriptor(data_type id)
data_descriptor(std::string_view format)
constexpr data_type id() const
Matches any type which is one of the base C++ types supported or at least that provides an arrow_trai...
Matches types providing valid and complete arrow_traits specialization.
Checks if a type is an extended base type for Arrow.
Matches C++ representation types which are supported by default.
Matches valid and complete arrow_traits specializations for type T.
#define SPARROW_API
Definition config.hpp:38
std::chrono::duration< int32_t, std::ratio< 2629746 > > months
typename get_inner_reference< C, is_const >::type get_inner_reference_t
consteval bool contains(L list)
Definition mp_utils.hpp:285
decltype(append(TypeList{}, Us{}...)) append_t
Appends one or more types or typelist to a given TypeList.
Definition mp_utils.hpp:103
constexpr bool is_type_instance_of_v
true if T is a concrete type template instantiation of U which is a type template.
Definition mp_utils.hpp:50
void unreachable()
Invokes undefined behavior.
Definition mp_utils.hpp:425
std::byte byte_t
static constexpr all_base_types_extended_t all_base_types_extended
Type list of every C++ representation type supported by default, in order matching data_type related...
constexpr std::string_view data_type_format_of()
constexpr std::string_view data_type_to_format(data_type type)
constexpr bool data_type_is_primitive(data_type dt)
bool all_digits(const std::string_view s)
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
SPARROW_API std::size_t num_bytes_for_decimal(const char *format)
float float32_t
static constexpr all_base_types_t all_base_types
Type list of every C++ representation type supported by default, in order matching data_type related...
mpl::typelist< null_type, bool, std::uint8_t, std::int8_t, std::uint16_t, std::int16_t, std::uint32_t, std::int32_t, std::uint64_t, std::int64_t, float16_t, float32_t, float64_t, std::string, std::vector< byte_t >, date_days, date_milliseconds, timestamp< std::chrono::seconds >, timestamp< std::chrono::milliseconds >, timestamp< std::chrono::microseconds >, timestamp< std::chrono::nanoseconds >, std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds, chrono::months, days_time_interval, month_day_nanoseconds_interval, chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds, list_value, struct_value, decimal< std::int32_t >, decimal< std::int64_t >, decimal< int128_t >, decimal< int256_t > > all_base_types_t
C++ types value representation types matching Arrow types.
constexpr data_type data_type_from_size(T={})
data_type format_to_data_type(std::string_view format)
mpl::append_t< all_base_types_t, char, std::string_view > all_base_types_extended_t
is arrow base type or arrow compound type (list<T>, struct<T> etc.)
date::zoned_time< Duration, TimeZonePtr > timestamp
double float64_t
constexpr bool data_type_is_integer(data_type dt)
std::conditional_t< std::same_as< T, std::string_view >, std::string, T > get_corresponding_arrow_type_t
Template alias to get the corresponding Arrow type for a given type.
typename arrow_traits< T >::default_layout default_layout_t
Binary layout type to use by default for the given C++ representation T of an arrow value.
numeric::float16_t float16_t
std::chrono::time_point< std::chrono::system_clock, std::chrono::milliseconds > date_milliseconds
constexpr auto arrow_type_id() -> data_type
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::chrono::time_point< std::chrono::system_clock, chrono::days > date_days
Provides compile-time information about Arrow data types.
A duration representing time elapsed since midnight, in microseconds.
A duration representing time elapsed since midnight, in milliseconds.
A duration representing time elapsed since midnight, in nanoseconds.
A duration representing time elapsed since midnight.
A sequence of types, used for meta-programming operations.
Definition mp_utils.hpp:55