Creating Arrow Objects¶
Recipes related to the creation of Arrays, Tables, Tensors and all other Arrow entities.
Create Arrays from Standard C++¶
Typed subclasses of arrow::ArrayBuilder
make it easy
to efficiently create Arrow arrays from existing C++ data:
arrow::Int32Builder builder;
ARROW_RETURN_NOT_OK(builder.Append(1));
ARROW_RETURN_NOT_OK(builder.Append(2));
ARROW_RETURN_NOT_OK(builder.Append(3));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish())
rout << arr->ToString() << std::endl;
[
1,
2,
3
]
Note
Builders allocate memory as needed, and appending a value takes amortized constant time.
Builders can also consume standard C++ containers:
// From a raw pointer plus an element count
std::array<int64_t, 4> values = {1, 2, 3, 4};
arrow::Int64Builder long_builder;
ARROW_RETURN_NOT_OK(long_builder.AppendValues(values.data(), values.size()));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, long_builder.Finish());
rout << arr->ToString() << std::endl;

// From a std::vector
std::vector<std::string> strvals = {"x", "y", "z"};
arrow::StringBuilder str_builder;
ARROW_RETURN_NOT_OK(str_builder.AppendValues(strvals));
ARROW_ASSIGN_OR_RAISE(arr, str_builder.Finish());
rout << arr->ToString() << std::endl;

// From an iterator range; std::set keeps only unique elements, so the
// duplicated 1.1 is appended once.
std::set<double> dblvals = {1.1, 1.1, 2.3};
arrow::DoubleBuilder dbl_builder;
ARROW_RETURN_NOT_OK(dbl_builder.AppendValues(dblvals.begin(), dblvals.end()));
ARROW_ASSIGN_OR_RAISE(arr, dbl_builder.Finish());
rout << arr->ToString() << std::endl;
[
1,
2,
3,
4
]
[
"x",
"y",
"z"
]
[
1.1,
2.3
]
Note
Builders will not take ownership of data in containers and will make a copy of the underlying data.
Generate Random Data for a Given Schema¶
To generate random data for a given schema, implementing a type visitor is a good approach. The following example handles only double (float64) and list types — every other type returns a NotImplemented status — but it can easily be extended to cover all types.
1class RandomBatchGenerator {
2 public:
3 std::shared_ptr<arrow::Schema> schema;
4
5 RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : schema(schema){};
6
7 arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t num_rows) {
8 num_rows_ = num_rows;
9 for (std::shared_ptr<arrow::Field> field : schema->fields()) {
10 ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
11 }
12
13 return arrow::RecordBatch::Make(schema, num_rows, arrays_);
14 }
15
16 // Default implementation
17 arrow::Status Visit(const arrow::DataType& type) {
18 return arrow::Status::NotImplemented("Generating data for", type.ToString());
19 }
20
21 arrow::Status Visit(const arrow::DoubleType&) {
22 auto builder = arrow::DoubleBuilder();
23 std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
24 for (int32_t i = 0; i < num_rows_; ++i) {
25 ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
26 }
27 ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
28 arrays_.push_back(array);
29 return arrow::Status::OK();
30 }
31
32 arrow::Status Visit(const arrow::ListType& type) {
33 // Generate offsets first, which determines number of values in sub-array
34 std::poisson_distribution<> d{/*mean=*/4};
35 auto builder = arrow::Int32Builder();
36 ARROW_RETURN_NOT_OK(builder.Append(0));
37 int32_t last_val = 0;
38 for (int32_t i = 0; i < num_rows_; ++i) {
39 last_val += d(gen_);
40 ARROW_RETURN_NOT_OK(builder.Append(last_val));
41 }
42 ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
43
44 // Since children of list has a new length, will use a new generator
45 RandomBatchGenerator value_gen(arrow::schema({arrow::field("x", type.value_type())}));
46 // Last index from the offsets array becomes the length of the sub-array
47 ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
48 std::shared_ptr<arrow::Array> values = inner_batch->column(0);
49
50 ARROW_ASSIGN_OR_RAISE(auto array,
51 arrow::ListArray::FromArrays(*offsets.get(), *values.get()));
52 arrays_.push_back(array);
53
54 return arrow::Status::OK();
55 }
56
57 protected:
58 std::random_device rd_{};
59 std::mt19937 gen_{rd_()};
60 std::vector<std::shared_ptr<arrow::Array>> arrays_;
61 int32_t num_rows_;
62}; // RandomBatchGenerator
Given such a generator, you can create random test data for any supported schema:
// Two columns: "x" holds float64 values, "y" holds lists of float64 values.
auto x_field = arrow::field("x", arrow::float64());
auto y_field = arrow::field("y", arrow::list(arrow::float64()));
std::shared_ptr<arrow::Schema> schema = arrow::schema({x_field, y_field});

RandomBatchGenerator generator(schema);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, generator.Generate(5));
rout << "Created batch: \n" << batch->ToString();

// Consider using ValidateFull to check correctness
ARROW_RETURN_NOT_OK(batch->ValidateFull());
Created batch:
x: [
5.980843530854203,
4.435859807531756,
1.5717146136471953,
6.408427191655361,
6.114639401815783
]
y: [
[
7.5139046109461995,
3.901458906361901
],
[
1.8043934969186881,
7.382034290325901,
6.9025437669738485,
8.593687075297465,
6.628949982902424
],
[
3.6824264586496627,
3.5311511405892575,
7.5748857903728695,
4.3317042358393785
],
[
1.973520262528361,
7.481697765977915,
5.647800307266349
],
[
5.661277366074096,
3.557105116960675,
3.663922594248059,
1.8091739898497332,
6.8476343672974584
]
]