sparrow 0.9.0
Loading...
Searching...
No Matches
data_type.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <chrono>
18#include <version>
19
25
26#if defined(SPARROW_USE_DATE_POLYFILL)
27
28# include <date/tz.h>
29
30# if defined(__cpp_lib_format)
31# include <format>
32
/**
 * std::formatter specialization for date::zoned_time.
 *
 * The specialization covers every time zone pointer type, not only the
 * default one, so that any instantiation of date::zoned_time can be formatted.
 *
 * No format specification is interpreted: the value is rendered exactly as
 * its stream insertion operator prints it.
 *
 * @tparam Duration Duration type of the zoned time.
 * @tparam TimeZonePtr Time zone pointer type of the zoned time.
 */
template <typename Duration, typename TimeZonePtr>
struct std::formatter<date::zoned_time<Duration, TimeZonePtr>>
{
    /// Consumes nothing from the format specification; returns the start of the parse context.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();
    }

    /// Streams the zoned time into a temporary string stream and copies the resulting text to the output.
    auto format(const date::zoned_time<Duration, TimeZonePtr>& value, std::format_context& ctx) const
    {
        std::ostringstream oss;
        oss << value;
        return std::format_to(ctx.out(), "{}", oss.str());
    }
};
49# endif
50
51#else
52namespace date = std::chrono;
53#endif
54
55#include <climits>
56#include <concepts>
57#include <cstdint>
58#include <cstring>
59#include <sstream>
60#include <string>
61
67
68
69#if __cplusplus > 202002L and defined(__STDCPP_FLOAT16_T__) and defined(__STDCPP_FLOAT32_T__) \
70 and defined(__STDCPP_FLOAT64_T__)
71# define SPARROW_STD_FIXED_FLOAT_SUPPORT
72#endif
73
74// TODO: use `std::float16_t` etc. exclusively once we switch to C++23, see
75// https://en.cppreference.com/w/cpp/types/floating-point
76#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
77# include <stdfloat>
78#else
79# include "sparrow/details/3rdparty/float16_t.hpp"
80#endif
81
82
83namespace sparrow
84{
85
86// TODO: use `std::float16_t` etc. exclusively once we switch to C++23, see
87// https://en.cppreference.com/w/cpp/types/floating-point
88#if defined(SPARROW_STD_FIXED_FLOAT_SUPPORT)
89 using float16_t = std::float16_t;
90 using float32_t = std::float32_t;
91 using float64_t = std::float64_t;
92#else
93 using float16_t = half_float::half;
94 using float32_t = float;
95 using float64_t = double;
96#endif
97
98 // P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
99 // See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
100 // For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
101 // TODO: use the following once libc++ has full support for P0355R7.
102 // using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
103 template <typename Duration, typename TimeZonePtr = const date::time_zone*>
104 using timestamp = date::zoned_time<Duration, TimeZonePtr>;
105
106 // We need to be sure the current target platform is setup to support correctly these types.
107 static_assert(sizeof(float16_t) == 2);
108 static_assert(sizeof(float32_t) == 4);
109 static_assert(sizeof(float64_t) == 8);
110 static_assert(std::is_floating_point_v<float16_t>);
111 static_assert(std::is_floating_point_v<float32_t>);
112 static_assert(std::is_floating_point_v<float64_t>);
113 static_assert(CHAR_BIT == 8);
114
115 using byte_t = std::byte; // For now we will use this to represent raw data TODO: evaluate later if it's
116 // the right choice, switch to char if not
117
119 {
120 };
121
122 constexpr bool operator==(const null_type&, const null_type&) noexcept
123 {
124 return true;
125 }
126
128 // TODO: does not support all types specified by the Arrow specification
129 // yet
186
/**
 * Checks whether a string consists exclusively of decimal digits.
 *
 * Characters are compared against the range '0'..'9' directly instead of
 * going through std::isdigit: std::isdigit is not a constexpr function, so
 * calling it would prevent this function from ever being evaluated in a
 * constant expression.
 *
 * @param s The string to inspect.
 * @return true if s is non-empty and each of its characters lies in
 *         '0'..'9'; false otherwise (an empty string yields false).
 */
[[nodiscard]] constexpr bool all_digits(const std::string_view s)
{
    if (s.empty())
    {
        return false;
    }
    for (const char c : s)
    {
        if (c < '0' || c > '9')
        {
            return false;
        }
    }
    return true;
}
200
201 // get the number of bytes of the decimal value type from format
202 [[nodiscard]] SPARROW_API std::size_t num_bytes_for_decimal(const char* format);
203
206 // TODO: consider returning an optional instead
207 [[nodiscard]] constexpr data_type format_to_data_type(std::string_view format)
208 {
209 // TODO: add missing conversions from
210 // https://arrow.apache.org/docs/dev/format/CDataInterface.html#data-type-description-format-strings
211 if (format.size() == 1)
212 {
213 switch (format[0])
214 {
215 case 'n':
216 return data_type::NA;
217 case 'b':
218 return data_type::BOOL;
219 case 'C':
220 return data_type::UINT8;
221 case 'c':
222 return data_type::INT8;
223 case 'S':
224 return data_type::UINT16;
225 case 's':
226 return data_type::INT16;
227 case 'I':
228 return data_type::UINT32;
229 case 'i':
230 return data_type::INT32;
231 case 'L':
232 return data_type::UINT64;
233 case 'l':
234 return data_type::INT64;
235 case 'e':
237 case 'f':
238 return data_type::FLOAT;
239 case 'g':
240 return data_type::DOUBLE;
241 case 'u':
242 return data_type::STRING;
243 case 'U':
245 case 'z':
246 return data_type::BINARY;
247 case 'Z':
249 default:
250 return data_type::NA;
251 }
252 }
253 else if (format == "vu") // string view
254 {
256 }
257 else if (format == "vz") // binary view
258 {
260 }
261 // TODO: add proper timestamp support below
262 else if (format.starts_with("t"))
263 {
264 if (format == "tdD")
265 {
267 }
268 else if (format == "tdm")
269 {
271 }
272 else if (format.starts_with("tss:"))
273 {
275 }
276 else if (format.starts_with("tsm:"))
277 {
279 }
280 else if (format.starts_with("tsu:"))
281 {
283 }
284 else if (format.starts_with("tsn:"))
285 {
287 }
288 else if (format == "tDs")
289 {
291 }
292 else if (format == "tDm")
293 {
295 }
296 else if (format == "tDu")
297 {
299 }
300 else if (format == "tDn")
301 {
303 }
304 else if (format == "tiM")
305 {
307 }
308 else if (format == "tiD")
309 {
311 }
312 else if (format == "tin")
313 {
315 }
316 else if (format == "tts")
317 {
319 }
320 else if (format == "ttm")
321 {
323 }
324 else if (format == "ttu")
325 {
327 }
328 else if (format == "ttn")
329 {
331 }
332 }
333 else if (format == "+l")
334 {
335 return data_type::LIST;
336 }
337 else if (format == "+L")
338 {
340 }
341 else if (format == "+vl")
342 {
344 }
345 else if (format == "+vL")
346 {
348 }
349 else if (format.starts_with("+w:"))
350 {
352 }
353 else if (format == "+s")
354 {
355 return data_type::STRUCT;
356 }
357 else if (format == "+m")
358 {
359 return data_type::MAP;
360 }
361 else if (format.starts_with("+ud:"))
362 {
364 }
365 else if (format.starts_with("+us:"))
366 {
368 }
369 else if (format.starts_with("+r"))
370 {
372 }
373 else if (format.starts_with("d:"))
374 {
375 const auto num_bytes = num_bytes_for_decimal(format.data());
376 switch (num_bytes)
377 {
378 case 4:
380 case 8:
382 case 16:
384 case 32:
386 default:
387 throw std::runtime_error("Invalid format for decimal");
388 }
389 }
390 else if (format.starts_with("w:"))
391 {
393 }
394
395 return data_type::NA;
396 }
397
401 template <std::floating_point T>
402 requires(sizeof(T) >= 2 && sizeof(T) <= 8)
403 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
404 {
405 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
406 switch (sizeof(T))
407 {
408 case 2:
410 case 4:
411 return data_type::FLOAT;
412 case 8:
413 return data_type::DOUBLE;
414 }
415
417 }
418
422 template <std::integral T>
423 requires(sizeof(T) >= 1 && sizeof(T) <= 8)
424 [[nodiscard]] constexpr data_type data_type_from_size(T = {}) noexcept
425 {
426 if constexpr (std::same_as<bool, T>)
427 {
428 return data_type::BOOL;
429 }
430 else if constexpr (std::signed_integral<T>)
431 {
432 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
433 switch (sizeof(T))
434 {
435 case 1:
436 return data_type::INT8;
437 case 2:
438 return data_type::INT16;
439 case 4:
440 return data_type::INT32;
441 case 8:
442 return data_type::INT64;
443 }
444 }
445 else
446 {
447 static_assert(std::unsigned_integral<T>);
448
449 // TODO: consider rewriting this to benefit from if constexpr? might not be necessary
450 switch (sizeof(T))
451 {
452 case 1:
453 return data_type::UINT8;
454 case 2:
455 return data_type::UINT16;
456 case 4:
457 return data_type::UINT32;
458 case 8:
459 return data_type::UINT64;
460 }
461 }
462
464 }
465
466 namespace detail
467 {
481 template <class A>
483 }
484
485 // REMARK: this function is not applicable to the following types:
486 // - all decimal types, because further information is needed (precision, scale)
487 // - fixed-size binary, because further information is needed (element size)
491 [[nodiscard]] constexpr std::string_view data_type_to_format(data_type type)
492 {
493 switch (type)
494 {
495 case data_type::NA:
496 return "n";
497 case data_type::BOOL:
498 return "b";
499 case data_type::UINT8:
500 return "C";
501 case data_type::INT8:
502 return "c";
504 return "S";
505 case data_type::INT16:
506 return "s";
508 return "I";
509 case data_type::INT32:
510 return "i";
512 return "L";
513 case data_type::INT64:
514 return "l";
516 return "e";
517 case data_type::FLOAT:
518 return "f";
520 return "g";
522 return "u";
524 return "U";
526 return "z";
528 return "Z";
530 return "tdD";
532 return "tdm";
534 return "tss:";
536 return "tsm:";
538 return "tsu:";
540 return "tsn:";
542 return "tDs";
544 return "tDm";
546 return "tDu";
548 return "tDn";
550 return "tiM";
552 return "tiD";
554 return "tin";
556 return "tts";
558 return "ttm";
560 return "ttu";
562 return "ttn";
563 case data_type::LIST:
564 return "+l";
566 return "+L";
568 return "+s";
569 case data_type::MAP:
570 return "+m";
571 default:
572 // TODO: add missing types
573 throw std::runtime_error("Unsupported data type");
574 }
575 }
576
578 [[nodiscard]] constexpr bool data_type_is_primitive(data_type dt) noexcept
579 {
580 switch (dt)
581 {
582 case data_type::BOOL:
583 case data_type::UINT8:
584 case data_type::INT8:
586 case data_type::INT16:
588 case data_type::INT32:
590 case data_type::INT64:
592 case data_type::FLOAT:
594 return true;
595 default:
596 return false;
597 }
598 }
599
601 [[nodiscard]] constexpr bool data_type_is_integer(data_type dt) noexcept
602 {
603 switch (dt)
604 {
605 case data_type::UINT8:
606 case data_type::INT8:
608 case data_type::INT16:
610 case data_type::INT32:
612 case data_type::INT64:
613 return true;
614 default:
615 return false;
616 }
617 }
618
619 class list_value;
620 class struct_value;
621 class map_value;
622
624 // NOTE: this needs to be in sync-order with `data_type`
626 null_type,
627 bool,
628 std::uint8_t,
629 std::int8_t,
630 std::uint16_t,
631 std::int16_t,
632 std::uint32_t,
633 std::int32_t,
634 std::uint64_t,
635 std::int64_t,
636 float16_t,
637 float32_t,
638 float64_t,
639 std::string,
640 std::vector<byte_t>,
641 date_days,
651 std::chrono::seconds,
652 std::chrono::milliseconds,
653 std::chrono::microseconds,
654 std::chrono::nanoseconds,
662 // TODO: add missing fundamental types here
665 map_value,
670
672 template <class T>
674
676 // template <class T>
677 // concept is_arrow_base_type_or_compound = is_arrow_base_type<T> || is_list_value_v<T>;
679
688 template <class T>
690
/// Maps a type to itself, except std::string_view, which is mapped to std::string.
template <class T>
using get_corresponding_arrow_type_t =
    typename std::conditional<std::is_same_v<T, std::string_view>, std::string, T>::type;
701
715 template <class T>
717
718 namespace detail
719 {
720 template <template <class> class>
722 {
723 };
724 }
729 template <class T>
732 typename T::value_type;
733
735 // typename detail::accepts_template<T::template default_layout>;
736
737 // TODO: add more interface requirements on the traits here
738 // TODO: add conversion operations between bytes and the value type
739 };
740
741
744 template <class T>
745 concept has_arrow_type_traits = requires { typename ::sparrow::arrow_traits<T>; }
747
750 template <class T>
752
753
755 template <has_arrow_type_traits T>
757
758 // For now, a tiny wrapper around data_type
759 // TODO: More data and functions to come
761 {
762 public:
763
764 constexpr data_descriptor() noexcept
766 {
767 }
768
769 data_descriptor(std::string_view format) noexcept
771 {
772 }
773
774 constexpr explicit data_descriptor(data_type id) noexcept
775 : m_id(id)
776 {
777 }
778
779 [[nodiscard]] constexpr data_type id() const noexcept
780 {
781 return m_id;
782 }
783
784 private:
785
786 data_type m_id;
787 };
788
789 namespace impl
790 {
791 template <class C, bool is_const>
793 : std::conditional<is_const, typename C::inner_const_reference, typename C::inner_reference>
794 {
795 };
796
797 template <class C, bool is_const>
799 } // namespace impl
800
801 template <class T>
802 concept layout_offset = std::same_as<T, std::int32_t> || std::same_as<T, std::int64_t>;
803}
804
805#if defined(__cpp_lib_format)
806
807namespace std
808{
809 template <>
810 struct formatter<sparrow::data_type>
811 {
812 constexpr auto parse(std::format_parse_context& ctx)
813 {
814 return ctx.begin(); // Simple implementation
815 }
816
817 auto format(const sparrow::data_type& data_type, std::format_context& ctx) const
818 {
819 static const auto get_enum_name = [](sparrow::data_type dt) -> std::string_view
820 {
821 using enum sparrow::data_type;
822 switch (dt)
823 {
824 case NA:
825 return "N/A";
826 case BOOL:
827 return "bool";
828 case UINT8:
829 return "uint8";
830 case INT8:
831 return "int8";
832 case UINT16:
833 return "uint16";
834 case INT16:
835 return "int16";
836 case UINT32:
837 return "uint32";
838 case INT32:
839 return "int32";
840 case UINT64:
841 return "uint64";
842 case INT64:
843 return "int64";
844 case HALF_FLOAT:
845 return "float16";
846 case FLOAT:
847 return "float32";
848 case DOUBLE:
849 return "double";
850 case STRING:
851 return "String";
852 case LARGE_STRING:
853 return "Large string";
854 case BINARY:
855 return "Binary";
856 case LARGE_BINARY:
857 return "Large binary";
858 case DATE_DAYS:
859 return "Date days";
861 return "Date milliseconds";
863 return "Timestamp seconds";
865 return "Timestamp milliseconds";
867 return "Timestamp microseconds";
869 return "Timestamp nanoseconds";
870 case DURATION_SECONDS:
871 return "Duration seconds";
873 return "Duration milliseconds";
875 return "Duration microseconds";
877 return "Duration nanoseconds";
878 case INTERVAL_MONTHS:
879 return "Interval months";
881 return "Interval days time";
883 return "Interval months days nanoseconds";
884 case TIME_SECONDS:
885 return "Time seconds";
887 return "Time milliseconds";
889 return "Time microseconds";
890 case TIME_NANOSECONDS:
891 return "Time nanoseconds";
892 case LIST:
893 return "List";
894 case LARGE_LIST:
895 return "Large list";
896 case LIST_VIEW:
897 return "List view";
898 case LARGE_LIST_VIEW:
899 return "Large list view";
900 case FIXED_SIZED_LIST:
901 return "Fixed sized list";
902 case STRUCT:
903 return "Struct";
904 case MAP:
905 return "Map";
906 case DENSE_UNION:
907 return "Dense union";
908 case SPARSE_UNION:
909 return "Sparse union";
910 case RUN_ENCODED:
911 return "Run encoded";
912 case DECIMAL32:
913 return "Decimal32";
914 case DECIMAL64:
915 return "Decimal64";
916 case DECIMAL128:
917 return "Decimal128";
918 case DECIMAL256:
919 return "Decimal256";
921 return "Fixed width binary";
922 case STRING_VIEW:
923 return "String view";
924 case BINARY_VIEW:
925 return "Binary view";
926 };
927 return "UNKNOWN";
928 };
929
930 return std::format_to(ctx.out(), "{}", get_enum_name(data_type));
931 }
932 };
933}
934
/**
 * std::formatter specialization for sparrow::null_type.
 *
 * No format specification is interpreted; the formatted output is always
 * the text "null_type".
 */
template <>
struct std::formatter<sparrow::null_type>
{
    /// Consumes nothing from the format specification; returns the start of the parse context.
    constexpr auto parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();
    }

    /// Writes the fixed text to the output iterator of the format context.
    auto format(const sparrow::null_type&, std::format_context& format_ctx) const
    {
        auto out = format_ctx.out();
        return std::format_to(out, "null_type");
    }
};
948
949inline std::ostream& operator<<(std::ostream& os, const sparrow::null_type&)
950{
951 os << std::format("{}", "null");
952 return os;
953}
954
/**
 * std::formatter specialization for std::byte.
 *
 * No format specification is interpreted; the byte is printed as the
 * decimal representation of its value converted to int.
 */
template <>
struct std::formatter<std::byte>
{
    /// Consumes nothing from the format specification; returns the start of the parse context.
    constexpr auto parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();
    }

    /// Converts the byte to int and formats that integer with the default "{}" specification.
    auto format(const std::byte& b, std::format_context& format_ctx) const
    {
        const int numeric_value = std::to_integer<int>(b);
        return std::format_to(format_ctx.out(), "{}", numeric_value);
    }
};
968
969#endif
constexpr data_type id() const noexcept
constexpr data_descriptor(data_type id) noexcept
constexpr data_descriptor() noexcept
data_descriptor(std::string_view format) noexcept
Matches any type which is one of the base C++ types supported or at least that provides an arrow_trai...
Matches types providing valid and complete arrow_traits specialization.
Checks if a type is an extended base type for Arrow.
Matches C++ representation types which are supported by default.
Matches valid and complete arrow_traits specializations for type T.
#define SPARROW_API
Definition config.hpp:38
std::chrono::duration< int32_t, std::ratio< 2629746 > > months
typename get_inner_reference< C, is_const >::type get_inner_reference_t
decltype(append(TypeList{}, Us{}...)) append_t
Type alias for appending types or typelists to a given typelist.
Definition mp_utils.hpp:198
constexpr bool is_type_instance_of_v
Variable template for convenient access to is_type_instance_of.
Definition mp_utils.hpp:102
void unreachable()
Invokes undefined behavior for optimization purposes.
Definition mp_utils.hpp:882
consteval bool contains()
Checks if a typelist contains a specific type.
Definition mp_utils.hpp:633
std::byte byte_t
constexpr std::string_view data_type_to_format(data_type type)
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr bool data_type_is_primitive(data_type dt) noexcept
half_float::half float16_t
Definition data_type.hpp:93
SPARROW_API std::size_t num_bytes_for_decimal(const char *format)
float float32_t
Definition data_type.hpp:94
mpl::append_t< all_base_types_t, char, std::string_view > all_base_types_extended_t
is arrow base type or arrow compound type (list<T>, struct<T> etc.)
date::zoned_time< Duration, TimeZonePtr > timestamp
double float64_t
Definition data_type.hpp:95
std::conditional_t< std::same_as< T, std::string_view >, std::string, T > get_corresponding_arrow_type_t
Template alias to get the corresponding Arrow type for a given type.
typename arrow_traits< T >::default_layout default_layout_t
Binary layout type to use by default for the given C++ representation T of an arrow value.
constexpr data_type data_type_from_size(T={}) noexcept
constexpr bool all_digits(const std::string_view s)
std::chrono::time_point< std::chrono::system_clock, std::chrono::milliseconds > date_milliseconds
constexpr bool data_type_is_integer(data_type dt) noexcept
mpl::typelist< null_type, bool, std::uint8_t, std::int8_t, std::uint16_t, std::int16_t, std::uint32_t, std::int32_t, std::uint64_t, std::int64_t, float16_t, float32_t, float64_t, std::string, std::vector< byte_t >, date_days, date_milliseconds, timestamp< std::chrono::seconds >, timestamp< std::chrono::milliseconds >, timestamp< std::chrono::microseconds >, timestamp< std::chrono::nanoseconds >, zoned_time_without_timezone_seconds, zoned_time_without_timezone_milliseconds, zoned_time_without_timezone_microseconds, zoned_time_without_timezone_nanoseconds, std::chrono::seconds, std::chrono::milliseconds, std::chrono::microseconds, std::chrono::nanoseconds, chrono::months, days_time_interval, month_day_nanoseconds_interval, chrono::time_seconds, chrono::time_milliseconds, chrono::time_microseconds, chrono::time_nanoseconds, list_value, struct_value, map_value, decimal< std::int32_t >, decimal< std::int64_t >, decimal< int128_t >, decimal< int256_t > > all_base_types_t
C++ types value representation types matching Arrow types.
constexpr data_type format_to_data_type(std::string_view format)
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
std::chrono::time_point< std::chrono::system_clock, chrono::days > date_days
std::ostream & operator<<(std::ostream &os, const sparrow::nullval_t &)
Provides compile-time information about Arrow data types.
A duration representing time elapsed since midnight, in microseconds.
A duration representing time elapsed since midnight, in milliseconds.
A duration representing time elapsed since midnight, in nanoseconds.
A duration representing time elapsed since midnight.
Metafunction for retrieving the data_type of a typed array.
A sequence of types used for metaprogramming operations.
Definition mp_utils.hpp:123
A zoned time value without timezone, in microseconds.
A zoned time value without timezone, in milliseconds.
A zoned time value without timezone, in nanoseconds.
A zoned time value without timezone, in seconds.