Creating Arrow Objects

Recipes related to the creation of Arrays, Tables, Tensors and all other Arrow entities.

Create Arrays from Standard C++

Typed subclasses of arrow::ArrayBuilder make it easy to efficiently create Arrow arrays from existing C++ data:

Creating an array from C++ primitives
arrow::Int32Builder builder;
ARROW_RETURN_NOT_OK(builder.Append(1));
ARROW_RETURN_NOT_OK(builder.Append(2));
ARROW_RETURN_NOT_OK(builder.Append(3));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish())
rout << arr->ToString() << std::endl;
Code Output
[
  1,
  2,
  3
]

Note

Builders allocate memory as needed, and appending a value takes amortized constant time.

Builders can also consume standard C++ containers:

// From a raw pointer plus an element count
std::array<int64_t, 4> int_values = {1, 2, 3, 4};
arrow::Int64Builder long_builder;
ARROW_RETURN_NOT_OK(long_builder.AppendValues(int_values.data(), int_values.size()));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, long_builder.Finish());
rout << arr->ToString() << std::endl;

// From a std::vector
std::vector<std::string> str_values = {"x", "y", "z"};
arrow::StringBuilder str_builder;
ARROW_RETURN_NOT_OK(str_builder.AppendValues(str_values));
ARROW_ASSIGN_OR_RAISE(arr, str_builder.Finish());
rout << arr->ToString() << std::endl;

// From an iterator range. std::set keeps unique keys only, so the duplicate
// 1.1 is dropped and two values reach the builder.
std::set<double> unique_doubles = {1.1, 1.1, 2.3};
arrow::DoubleBuilder dbl_builder;
ARROW_RETURN_NOT_OK(
    dbl_builder.AppendValues(unique_doubles.begin(), unique_doubles.end()));
ARROW_ASSIGN_OR_RAISE(arr, dbl_builder.Finish());
rout << arr->ToString() << std::endl;
Code Output
[
  1,
  2,
  3,
  4
]
[
  "x",
  "y",
  "z"
]
[
  1.1,
  2.3
]

Note

Builders do not take ownership of the data held in the containers; they copy the underlying data instead.

Generate Random Data for a Given Schema

A type visitor is a convenient way to generate random data for a given schema. The following example handles only double (float64) arrays and list arrays, but it can easily be extended to other types.

Using visitor pattern to generate random record batches
 1class RandomBatchGenerator {
 2 public:
 3  std::shared_ptr<arrow::Schema> schema;
 4
 5  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : schema(schema){};
 6
 7  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t num_rows) {
 8    num_rows_ = num_rows;
 9    for (std::shared_ptr<arrow::Field> field : schema->fields()) {
10      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
11    }
12
13    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
14  }
15
16  // Default implementation
17  arrow::Status Visit(const arrow::DataType& type) {
18    return arrow::Status::NotImplemented("Generating data for", type.ToString());
19  }
20
21  arrow::Status Visit(const arrow::DoubleType&) {
22    auto builder = arrow::DoubleBuilder();
23    std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
24    for (int32_t i = 0; i < num_rows_; ++i) {
25      ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
26    }
27    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
28    arrays_.push_back(array);
29    return arrow::Status::OK();
30  }
31
32  arrow::Status Visit(const arrow::ListType& type) {
33    // Generate offsets first, which determines number of values in sub-array
34    std::poisson_distribution<> d{/*mean=*/4};
35    auto builder = arrow::Int32Builder();
36    ARROW_RETURN_NOT_OK(builder.Append(0));
37    int32_t last_val = 0;
38    for (int32_t i = 0; i < num_rows_; ++i) {
39      last_val += d(gen_);
40      ARROW_RETURN_NOT_OK(builder.Append(last_val));
41    }
42    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
43
44    // Since children of list has a new length, will use a new generator
45    RandomBatchGenerator value_gen(arrow::schema({arrow::field("x", type.value_type())}));
46    // Last index from the offsets array becomes the length of the sub-array
47    ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
48    std::shared_ptr<arrow::Array> values = inner_batch->column(0);
49
50    ARROW_ASSIGN_OR_RAISE(auto array,
51                          arrow::ListArray::FromArrays(*offsets.get(), *values.get()));
52    arrays_.push_back(array);
53
54    return arrow::Status::OK();
55  }
56
57 protected:
58  std::random_device rd_{};
59  std::mt19937 gen_{rd_()};
60  std::vector<std::shared_ptr<arrow::Array>> arrays_;
61  int32_t num_rows_;
62};  // RandomBatchGenerator

Given such a generator, you can create random test data for any supported schema:

// Two columns: "x" holds doubles, "y" holds lists of doubles.
auto x_field = arrow::field("x", arrow::float64());
auto y_field = arrow::field("y", arrow::list(arrow::float64()));
std::shared_ptr<arrow::Schema> schema = arrow::schema({x_field, y_field});

// Produce a five-row batch matching the schema.
RandomBatchGenerator generator(schema);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, generator.Generate(5));

rout << "Created batch: \n" << batch->ToString();

// ValidateFull is a good way to check that the generated batch is correct
ARROW_RETURN_NOT_OK(batch->ValidateFull());
Code Output
Created batch: 
x:   [
    4.430411238665037,
    1.1402567322890786,
    4.757906450297549,
    7.187500851186735,
    2.9767679068012853
  ]
y:   [
    [
      7.849290083736791,
      7.05431322008432,
      3.417650470075496,
      6.542268012615005,
      2.382945633558648,
      5.190510731957064
    ],
    [
      4.063498588931077,
      8.817756925510391,
      5.3547571198281325,
      6.778059481095653,
      6.530979651948345,
      5.120788794246223
    ],
    [
      6.399947715489706,
      5.02502018174901,
      5.331867600463648
    ],
    [
      6.574170475881674
    ],
    [
      3.204670046727827,
      8.072455895005092,
      2.0748510689797657,
      6.32513834819659,
      4.27734561602461,
      7.1846011751883765
    ]
  ]