sparrow 1.3.0
Loading...
Searching...
No Matches
record_batch.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <algorithm>
18#include <initializer_list>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <unordered_map>
23#include <vector>
24
25#include "sparrow/array.hpp"
28
29#if defined(__cpp_lib_format)
31#endif
32
33namespace sparrow
34{
79 {
80 public:
81
82 using name_type = std::string;
83 using size_type = std::size_t;
84 using initializer_type = std::initializer_list<std::pair<name_type, array>>;
85
86 using name_range = std::ranges::ref_view<const std::vector<name_type>>;
87
94 record_batch() = default;
95
120 template <
121 std::ranges::input_range NR,
122 std::ranges::input_range CR,
123 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
124 requires(
125 std::convertible_to<std::ranges::range_value_t<NR>, std::string>
126 and std::same_as<std::ranges::range_value_t<CR>, array>
127 )
128 constexpr record_batch(
129 NR&& names,
130 CR&& columns,
131 std::optional<std::string_view> name = std::nullopt,
132 std::optional<METADATA_RANGE> metadata = std::nullopt
133 );
134
156 template <std::ranges::input_range CR, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
157 requires std::same_as<std::ranges::range_value_t<CR>, array>
159 CR&& columns,
160 std::optional<std::string_view> name = std::nullopt,
161 std::optional<METADATA_RANGE> metadata = std::nullopt
162 );
163
177
187
198
209
219
229
246
258
272
273 record_batch(record_batch&&) noexcept = default;
274 record_batch& operator=(record_batch&&) = default;
275
285
296
306 SPARROW_API bool contains_column(const name_type& key) const;
307
321
334 SPARROW_API const array& get_column(const name_type& key) const;
335
349
363
377
385 SPARROW_API const std::optional<name_type>& name() const;
386
397
407 auto columns() const
408 {
409 return std::views::iota(size_type{0}, nb_columns())
410 | std::views::transform(
411 [this](size_type i) -> const array&
412 {
413 return get_column(i);
414 }
415 );
416 }
417
433
452
469
492
513
514 private:
515
516 template <class AS>
517 void init(ArrowArray&& arr, AS* sch);
518
519 template <class AA, class AS>
520 void init(AA* arr, AS* sch);
521
522 SPARROW_API void partial_init_from_schema(const ArrowSchema& sch);
523
535 template <class U, class R>
536 [[nodiscard]] std::vector<U> to_vector(R&& range) const;
537
545 SPARROW_API void update_array_map_cache() const;
546
562 SPARROW_API void check_consistency() const;
563
564 using metadata_type = std::vector<metadata_pair>;
565 using array_storage_type = std::variant<array, std::reference_wrapper<array>>;
566
567 std::optional<name_type> m_name = std::nullopt;
568 std::optional<metadata_type> m_metadata = std::nullopt;
569 std::vector<name_type> m_name_list;
570 std::vector<array_storage_type> m_array_list;
572 mutable std::unordered_map<name_type, array*> m_array_map;
574 mutable bool m_dirty_map = true;
575
579 SPARROW_API static array* get_array_ptr(array_storage_type& storage);
580 SPARROW_API static const array* get_array_ptr(const array_storage_type& storage);
581 };
582
600 bool operator==(const record_batch& lhs, const record_batch& rhs);
601
602 SPARROW_API std::pair<ArrowArray, ArrowSchema> extract_arrow_structures(sparrow::record_batch&& rb);
603
604 /*******************************
605 * record_batch implementation *
606 *******************************/
607
608 template <std::ranges::input_range NR, std::ranges::input_range CR, input_metadata_container METADATA_RANGE>
609 requires(std::convertible_to<std::ranges::range_value_t<NR>, std::string>
610 and std::same_as<std::ranges::range_value_t<CR>, array>)
612 NR&& names,
613 CR&& columns,
614 std::optional<std::string_view> name,
615 std::optional<METADATA_RANGE> metadata
616 )
617 : m_name(name)
618 , m_metadata(std::move(metadata))
619 , m_name_list(to_vector<name_type>(std::forward<NR>(names)))
620 {
621 m_array_list.reserve(std::ranges::size(columns));
622 for (auto& col : columns)
623 {
624 if constexpr (std::is_lvalue_reference_v<CR>)
625 {
626 m_array_list.emplace_back(col);
627 }
628 else
629 {
630 m_array_list.emplace_back(std::move(col));
631 }
632 }
633 update_array_map_cache();
634 }
635
636 namespace detail
637 {
638 inline std::vector<record_batch::name_type> get_names(const std::vector<array>& array_list)
639 {
640 const auto names = array_list
641 | std::views::transform(
642 [](const array& ar)
643 {
644 return ar.name().value();
645 }
646 );
647 return {names.begin(), names.end()};
648 }
649 }
650
651 template <std::ranges::input_range CR, input_metadata_container METADATA_RANGE>
652 requires std::same_as<std::ranges::range_value_t<CR>, array>
653 record_batch::record_batch(CR&& columns, std::optional<std::string_view> name, std::optional<METADATA_RANGE> metadata)
654 : m_name(name)
655 , m_metadata(std::move(metadata))
656 , m_name_list(detail::get_names(columns))
657 {
658 m_array_list.reserve(std::ranges::size(columns));
659 for (auto& col : columns)
660 {
661 if constexpr (std::is_lvalue_reference_v<CR>)
662 {
663 m_array_list.emplace_back(col);
664 }
665 else
666 {
667 m_array_list.emplace_back(std::move(col));
668 }
669 }
670 update_array_map_cache();
671 }
672
673 template <class AS>
674 void record_batch::init(ArrowArray&& arr, AS* sch)
675 {
676 partial_init_from_schema(*sch);
677 std::size_t column_size = m_name_list.capacity();
678 for (std::size_t i = 0; i < column_size; ++i)
679 {
680 m_name_list.emplace_back(sch->children[i]->name);
681 m_array_list.emplace_back(array(std::move(*(arr.children[i])), sch->children[i]));
682 *(arr.children[i]) = make_empty_arrow_array();
683 }
684 arr.release(&arr);
685 update_array_map_cache();
686 }
687
688 template <class AA, class AS>
689 void record_batch::init(AA* arr, AS* sch)
690 {
691 partial_init_from_schema(*sch);
692 std::size_t column_size = m_name_list.capacity();
693 for (std::size_t i = 0; i < column_size; ++i)
694 {
695 m_name_list.emplace_back(sch->children[i]->name);
696 m_array_list.emplace_back(array(arr->children[i], sch->children[i]));
697 }
698 update_array_map_cache();
699 }
700
701 template <class U, class R>
702 std::vector<U> record_batch::to_vector(R&& range) const
703 {
704 std::vector<U> v;
705 if constexpr (std::ranges::sized_range<decltype(range)>)
706 {
707 v.reserve(std::ranges::size(range));
708 }
709 if constexpr (std::is_lvalue_reference_v<R>)
710 {
711 std::ranges::copy(range, std::back_inserter(v));
712 }
713 else
714 {
715 std::ranges::move(range, std::back_inserter(v));
716 }
717 return v;
718 }
719}
720
721#if defined(__cpp_lib_format)
/**
 * @brief std::format support for sparrow::record_batch.
 *
 * The batch is rendered through sparrow::to_table_with_columns, using the
 * column names as headers and one list of cell values per column.
 */
template <>
struct std::formatter<sparrow::record_batch>
{
    /**
     * @brief Parses the format specification.
     *
     * No specification is interpreted: the parse position is returned
     * unchanged.
     */
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * @brief Writes @p rb to the output of @p ctx.
     *
     * @param rb Record batch to format.
     * @param ctx Formatting context receiving the output.
     * @return The output iterator of @p ctx.
     */
    auto format(const sparrow::record_batch& rb, std::format_context& ctx) const
    {
        using cell_type = sparrow::array_traits::const_reference;

        // The row count does not change while formatting; query it once
        // instead of on every inner-loop iteration.
        const std::size_t row_count = rb.nb_rows();

        auto columns_view = rb.columns();
        std::vector<std::vector<cell_type>> values_by_columns;
        values_by_columns.reserve(rb.nb_columns());

        for (const auto& column : columns_view)
        {
            // Build each column's cells directly in their final slot.
            auto& column_values = values_by_columns.emplace_back();
            column_values.reserve(row_count);
            for (std::size_t row = 0; row < row_count; ++row)
            {
                column_values.push_back(column[row]);
            }
        }

        sparrow::to_table_with_columns(ctx.out(), rb.names(), values_by_columns);
        return ctx.out();
    }
};
751
namespace sparrow
{
    /**
     * @brief Streams a record batch using its std::format representation.
     *
     * @param os Output stream to write to.
     * @param value Record batch to print.
     * @return @p os, to allow chaining.
     */
    inline std::ostream& operator<<(std::ostream& os, const record_batch& value)
    {
        return os << std::format("{}", value);
    }
}
760
761#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:43
SPARROW_API std::optional< std::string_view > name() const
SPARROW_API record_batch(const record_batch &other)
Copy constructor.
SPARROW_API record_batch(ArrowArray &&array, ArrowSchema &&schema)
Constructs a record_batch from the given Arrow C structures, whose ownership is transferred to the re...
SPARROW_API const std::optional< name_type > & name() const
Gets the name of the record batch.
SPARROW_API void add_column_reference(name_type name, array &column)
Adds a column by reference with the specified name.
std::initializer_list< std::pair< name_type, array > > initializer_type
SPARROW_API void add_column(array column)
Adds a new column using the array's internal name.
record_batch()=default
Default constructor creating an empty record batch.
SPARROW_API const array & get_column(const name_type &key) const
Gets the column with the specified name.
SPARROW_API record_batch(struct_array &&ar)
Constructs a record_batch from a struct_array.
record_batch(record_batch &&) noexcept=default
SPARROW_API name_range names() const
Gets a range view of the column names.
SPARROW_API bool contains_column(const name_type &key) const
Checks if the record batch contains a column with the specified name.
SPARROW_API struct_array extract_struct_array()
Moves the internal columns into a struct_array and empties the record batch.
SPARROW_API record_batch(ArrowArray &&array, ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API const name_type & get_column_name(size_type index) const
Gets the name of the column at the specified index.
SPARROW_API record_batch(ArrowArray *array, ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API record_batch(ArrowArray &&array, const ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API void add_column(name_type name, array column)
Adds a new column to the record batch with the specified name.
SPARROW_API size_type nb_rows() const
Gets the number of rows in the record batch.
SPARROW_API record_batch & operator=(const record_batch &other)
Copy assignment operator.
SPARROW_API record_batch(const ArrowArray *array, const ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API size_type nb_columns() const
Gets the number of columns in the record batch.
SPARROW_API void add_column_reference(array &column)
Adds a column by reference using the array's internal name.
SPARROW_API record_batch(initializer_type init)
Constructs a record_batch from initializer list of name-array pairs.
auto columns() const
Gets a range view of the columns.
std::ranges::ref_view< const std::vector< name_type > > name_range
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:332
#define SPARROW_API
Definition config.hpp:38
std::vector< record_batch::name_type > get_names(const std::vector< array > &array_list)
ArrowArray make_empty_arrow_array()
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArray and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:110
std::ostream & operator<<(std::ostream &os, const nullval_t &)
constexpr void to_table_with_columns(OutputIt out, const Headers &headers, const Columns &columns)
Definition format.hpp:139