sparrow 1.2.0
Loading...
Searching...
No Matches
record_batch.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include <algorithm>
18#include <initializer_list>
19#include <optional>
20#include <ranges>
21#include <string>
22#include <unordered_map>
23#include <vector>
24
25#include "sparrow/array.hpp"
28
29#if defined(__cpp_lib_format)
31#endif
32
33namespace sparrow
34{
70 {
71 public:
72
73 using name_type = std::string;
74 using size_type = std::size_t;
75 using initializer_type = std::initializer_list<std::pair<name_type, array>>;
76
77 using name_range = std::ranges::ref_view<const std::vector<name_type>>;
78 using column_range = std::ranges::ref_view<const std::vector<array>>;
79
/**
 * Constructs a record_batch from separate name and array ranges.
 *
 * @tparam NR Input range whose value type is convertible to std::string.
 * @tparam CR Input range whose value type is exactly array.
 * @tparam METADATA_RANGE Metadata container type; defaults to std::vector<metadata_pair>.
 * @param names Range of column names.
 * @param columns Range of column arrays.
 * @param name Optional name of the record batch; defaults to std::nullopt.
 * @param metadata Optional metadata; defaults to std::nullopt.
 */
104 template <
105 std::ranges::input_range NR,
106 std::ranges::input_range CR,
107 input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
108 requires(
109 std::convertible_to<std::ranges::range_value_t<NR>, std::string>
110 and std::same_as<std::ranges::range_value_t<CR>, array>
111 )
112 constexpr record_batch(
113 NR&& names,
114 CR&& columns,
115 std::optional<std::string_view> name = std::nullopt,
116 std::optional<METADATA_RANGE> metadata = std::nullopt
117 );
118
140 template <std::ranges::input_range CR, input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
141 requires std::same_as<std::ranges::range_value_t<CR>, array>
143 CR&& columns,
144 std::optional<std::string_view> name = std::nullopt,
145 std::optional<METADATA_RANGE> metadata = std::nullopt
146 );
147
161
171
182
193
203
213
230
242
256
/// Move constructor: compiler-generated (defaulted) and declared noexcept.
257 record_batch(record_batch&&) noexcept = default;
/// Move assignment operator: compiler-generated (defaulted).
258 record_batch& operator=(record_batch&&) = default;
259
269
280
/**
 * Checks if the record batch contains a column with the specified name.
 *
 * @param key Name of the column to look for.
 * @return Whether such a column exists.
 */
290 SPARROW_API bool contains_column(const name_type& key) const;
291
305
/**
 * Gets the column with the specified name.
 *
 * @param key Name of the column to retrieve.
 * @return A const reference to the matching column.
 */
// NOTE(review): behaviour when no column is named `key` is not visible in this header - confirm.
318 SPARROW_API const array& get_column(const name_type& key) const;
319
333
347
361
/**
 * Gets the name of the record batch.
 *
 * @return A const reference to the optional name.
 */
369 SPARROW_API const std::optional<name_type>& name() const;
370
381
392
408
427
444
445 private:
446
/**
 * Fills the name and column lists from an ArrowArray passed by rvalue and its schema.
 *
 * The definition in this file moves each child ArrowArray into a new column,
 * overwrites the moved-from child with an empty ArrowArray, and finally
 * calls arr.release(&arr).
 *
 * @param arr Arrow array structure whose children become the columns.
 * @param sch Pointer to the matching schema; child names become column names.
 */
447 template <class AS>
448 void init(ArrowArray&& arr, AS* sch);
449
/**
 * Fills the name and column lists from pointers to an Arrow array and schema.
 *
 * The definition in this file builds each column from the child pointers
 * arr->children[i] and sch->children[i]; it does not call release on arr.
 *
 * @param arr Pointer to the Arrow array structure whose children become the columns.
 * @param sch Pointer to the matching schema; child names become column names.
 */
450 template <class AA, class AS>
451 void init(AA* arr, AS* sch);
452
// Called first by both init() overloads, before the per-child loop.
// NOTE(review): implementation not visible in this header; presumably reads the
// batch-level fields from the schema and reserves the name/column lists - confirm.
453 SPARROW_API void partial_init_from_schema(const ArrowSchema& sch);
454
/**
 * Converts a range into a std::vector<U>.
 *
 * Per the definition in this file, elements are copied when the range is
 * passed as an lvalue and moved when it is passed as an rvalue.
 *
 * @tparam U Element type of the resulting vector.
 * @tparam R Range type (forwarding reference).
 * @param range Source range.
 * @return A vector holding the elements of range.
 */
466 template <class U, class R>
467 [[nodiscard]] std::vector<U> to_vector(R&& range) const;
468
// Called at the end of the range constructors and of both init() overloads.
// Declared const, so it can only modify the mutable members.
// NOTE(review): implementation not visible here; presumably rebuilds m_array_map - confirm.
476 SPARROW_API void update_array_map_cache() const;
477
// NOTE(review): implementation and call sites are not visible in this header;
// presumably validates that names and columns agree with each other - confirm.
493 SPARROW_API void check_consistency() const;
494
// Storage type used for the record batch metadata.
495 using metadata_type = std::vector<metadata_pair>;
// Optional name of the record batch; unset by default.
496 std::optional<name_type> m_name = std::nullopt;
// Optional metadata of the record batch; unset by default.
497 std::optional<metadata_type> m_metadata = std::nullopt;
// Column names. Declared before m_array_list, so it is initialized first
// in constructor member-initializer lists.
498 std::vector<name_type> m_name_list;
// Column arrays, filled in the same order as m_name_list.
499 std::vector<array> m_array_list;
// Mutable so that const members such as update_array_map_cache() can modify it.
// NOTE(review): presumably maps each column name to its entry in m_array_list - confirm.
500 mutable std::unordered_map<name_type, array*> m_array_map;
// Starts as true. NOTE(review): presumably marks m_array_map as needing a rebuild - confirm.
502 mutable bool m_dirty_map = true;
503 };
504
522 bool operator==(const record_batch& lhs, const record_batch& rhs);
523
525
526 /*******************************
527 * record_batch implementation *
528 *******************************/
529
530 template <std::ranges::input_range NR, std::ranges::input_range CR, input_metadata_container METADATA_RANGE>
531 requires(std::convertible_to<std::ranges::range_value_t<NR>, std::string>
532 and std::same_as<std::ranges::range_value_t<CR>, array>)
534 NR&& names,
535 CR&& columns,
536 std::optional<std::string_view> name,
537 std::optional<METADATA_RANGE> metadata
538 )
539 : m_name(name)
540 , m_metadata(std::move(metadata))
541 , m_name_list(to_vector<name_type>(std::forward<NR>(names)))
542 , m_array_list(to_vector<array>(std::forward<CR>(columns)))
543 {
544 update_array_map_cache();
545 }
546
547 namespace detail
548 {
549 inline std::vector<record_batch::name_type> get_names(const std::vector<array>& array_list)
550 {
551 const auto names = array_list
552 | std::views::transform(
553 [](const array& ar)
554 {
555 return ar.name().value();
556 }
557 );
558 return {names.begin(), names.end()};
559 }
560 }
561
562 template <std::ranges::input_range CR, input_metadata_container METADATA_RANGE>
563 requires std::same_as<std::ranges::range_value_t<CR>, array>
564 record_batch::record_batch(CR&& columns, std::optional<std::string_view> name, std::optional<METADATA_RANGE> metadata)
565 : m_name(name)
566 , m_metadata(std::move(metadata))
567 , m_name_list(detail::get_names(columns))
568 , m_array_list(to_vector<array>(std::move(columns)))
569 {
570 update_array_map_cache();
571 }
572
573 template <class AS>
574 void record_batch::init(ArrowArray&& arr, AS* sch)
575 {
576 partial_init_from_schema(*sch);
577 std::size_t column_size = m_name_list.capacity();
578 for (std::size_t i = 0; i < column_size; ++i)
579 {
580 m_name_list.emplace_back(sch->children[i]->name);
581 m_array_list.emplace_back(std::move(*(arr.children[i])), sch->children[i]);
582 *(arr.children[i]) = make_empty_arrow_array();
583 }
584 arr.release(&arr);
585 update_array_map_cache();
586 }
587
588 template <class AA, class AS>
589 void record_batch::init(AA* arr, AS* sch)
590 {
591 partial_init_from_schema(*sch);
592 std::size_t column_size = m_name_list.capacity();
593 for (std::size_t i = 0; i < column_size; ++i)
594 {
595 m_name_list.emplace_back(sch->children[i]->name);
596 m_array_list.emplace_back(arr->children[i], sch->children[i]);
597 }
598 update_array_map_cache();
599 }
600
601 template <class U, class R>
602 std::vector<U> record_batch::to_vector(R&& range) const
603 {
604 std::vector<U> v;
605 if constexpr (std::ranges::sized_range<decltype(range)>)
606 {
607 v.reserve(std::ranges::size(range));
608 }
609 if constexpr (std::is_lvalue_reference_v<R>)
610 {
611 std::ranges::copy(range, std::back_inserter(v));
612 }
613 else
614 {
615 std::ranges::move(range, std::back_inserter(v));
616 }
617 return v;
618 }
619}
620
621#if defined(__cpp_lib_format)
/**
 * std::formatter specialization for sparrow::record_batch.
 *
 * Formatting hands the column names and, for each column, a lazy view of
 * its values to sparrow::to_table_with_columns.
 */
template <>
struct std::formatter<sparrow::record_batch>
{
    /// Does not interpret the format specification; returns ctx.begin() unchanged.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /**
     * Writes rb to the output of ctx.
     *
     * @param rb Record batch to format.
     * @param ctx Formatting context providing the output iterator.
     * @return ctx.out().
     */
    auto format(const sparrow::record_batch& rb, std::format_context& ctx) const
    {
        using size_type = sparrow::record_batch::size_type;

        const auto values_by_columns = rb.columns()
                                       | std::views::transform(
                                           [&rb](const auto& ar)
                                           {
                                               // Count rows in size_type, the type nb_rows() returns:
                                               // a narrower `0u` counter would wrap before reaching
                                               // a bound above UINT_MAX.
                                               return std::views::iota(size_type{0}, rb.nb_rows())
                                                      | std::views::transform(
                                                          [&ar](const auto i)
                                                          {
                                                              return ar[i];
                                                          }
                                                      );
                                           }
                                       );

        sparrow::to_table_with_columns(ctx.out(), rb.names(), values_by_columns);
        // NOTE(review): to_table_with_columns takes the iterator by value and returns void,
        // so ctx.out() is returned as-is - confirm the context's iterator reflects the writes.
        return ctx.out();
    }
};
650
namespace sparrow
{
    /**
     * Streams a record_batch by formatting it with std::format("{}", value).
     *
     * @param os Output stream to write to.
     * @param value Record batch to print.
     * @return The stream os, to allow chaining.
     */
    inline std::ostream& operator<<(std::ostream& os, const record_batch& value)
    {
        return os << std::format("{}", value);
    }
}
659
660#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:43
SPARROW_API std::optional< std::string_view > name() const
SPARROW_API record_batch(const record_batch &other)
Copy constructor.
SPARROW_API record_batch(ArrowArray &&array, ArrowSchema &&schema)
Constructs a record_batch from the given Arrow C structures, whose ownership is transferred to the re...
SPARROW_API const std::optional< name_type > & name() const
Gets the name of the record batch.
std::initializer_list< std::pair< name_type, array > > initializer_type
SPARROW_API const array & get_column(const name_type &key) const
Gets the column with the specified name.
SPARROW_API record_batch(struct_array &&ar)
Constructs a record_batch from a struct_array.
record_batch(record_batch &&) noexcept=default
SPARROW_API name_range names() const
Gets a range view of the column names.
SPARROW_API bool contains_column(const name_type &key) const
Checks if the record batch contains a column with the specified name.
SPARROW_API struct_array extract_struct_array()
Moves the internal columns into a struct_array and empties the record batch.
SPARROW_API column_range columns() const
Gets a range view of the columns.
SPARROW_API record_batch(ArrowArray &&array, ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
constexpr record_batch(NR &&names, CR &&columns, std::optional< std::string_view > name=std::nullopt, std::optional< METADATA_RANGE > metadata=std::nullopt)
Constructs a record_batch from separate name and array ranges.
SPARROW_API const name_type & get_column_name(size_type index) const
Gets the name of the column at the specified index.
SPARROW_API record_batch(ArrowArray *array, ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API record_batch(ArrowArray &&array, const ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API void add_column(name_type name, array column)
Adds a new column to the record batch with the specified name.
SPARROW_API size_type nb_rows() const
Gets the number of rows in the record batch.
SPARROW_API record_batch & operator=(const record_batch &other)
Copy assignment operator.
SPARROW_API record_batch(const ArrowArray *array, const ArrowSchema *schema)
Constructs a record_batch from the given Arrow C structures.
SPARROW_API size_type nb_columns() const
Gets the number of columns in the record batch.
SPARROW_API record_batch(initializer_type init)
Constructs a record_batch from initializer list of name-array pairs.
std::ranges::ref_view< const std::vector< array > > column_range
std::ranges::ref_view< const std::vector< name_type > > name_range
Concept for input containers that can provide metadata pairs.
Definition metadata.hpp:304
#define SPARROW_API
Definition config.hpp:38
std::vector< record_batch::name_type > get_names(const std::vector< array > &array_list)
ArrowArray make_empty_arrow_array()
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArray and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:110
std::ostream & operator<<(std::ostream &os, const nullval_t &)
constexpr void to_table_with_columns(OutputIt out, const Headers &headers, const Columns &columns)
Definition format.hpp:139
std::pair< metadata_key, metadata_value > metadata_pair
Type alias for metadata key-value pairs.
Definition metadata.hpp:61