20#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
21#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
25#include "common_defs.hpp"
29inline array_of_strings default_array_of_strings_update_policy::create()
const {
30 return array_of_strings(0,
"");
33inline void default_array_of_strings_update_policy::update(
34 array_of_strings& array,
const array_of_strings& input
36 const auto length =
static_cast<size_t>(input.size());
37 array = array_of_strings(
static_cast<uint8_t
>(length),
"");
38 for (
size_t i = 0; i < length; ++i) array[i] = input[i];
41inline void default_array_of_strings_update_policy::update(
42 array_of_strings& array,
const array_of_strings* input
44 if (input ==
nullptr) {
45 array = array_of_strings(0,
"");
48 const auto length =
static_cast<size_t>(input->size());
49 array = array_of_strings(
static_cast<uint8_t
>(length),
"");
50 for (
size_t i = 0; i < length; ++i) array[i] = (*input)[i];
55 static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
56 XXHash64 hasher(STRING_ARR_HASH_SEED);
57 const auto size =
static_cast<size_t>(key.size());
58 for (
size_t i = 0; i < size; ++i) {
59 const auto& entry = key[i];
60 hasher.
add(entry.data(), entry.size());
61 if (i + 1 < size) hasher.
add(
",", 1);
66template<
typename Allocator,
typename Policy>
73template<
typename Allocator>
74template<
typename Sketch>
76 const Sketch& sketch,
bool ordered
77):
Base(sketch, ordered) {}
79template<
typename Allocator>
82): Base(std::move(base)) {}
84template<
typename Allocator>
85template<
typename SerDe>
87 std::istream& is, uint64_t seed,
const SerDe& sd,
const Allocator& allocator
88) -> compact_array_of_strings_tuple_sketch {
89 auto base = Base::deserialize(is, seed, sd, allocator);
90 return compact_array_of_strings_tuple_sketch(std::move(base));
93template<
typename Allocator>
94template<
typename SerDe>
96 const void* bytes,
size_t size, uint64_t seed,
const SerDe& sd,
const Allocator& allocator
97) -> compact_array_of_strings_tuple_sketch {
98 auto base = Base::deserialize(bytes, size, seed, sd, allocator);
99 return compact_array_of_strings_tuple_sketch(std::move(base));
102template<
typename Allocator>
103default_array_of_strings_serde<Allocator>::default_array_of_strings_serde(
const Allocator& allocator):
104 summary_allocator_(allocator) {}
106template<
typename Allocator>
107void default_array_of_strings_serde<Allocator>::serialize(
108 std::ostream& os,
const array_of_strings* items,
unsigned num
112 for (; i < num; ++i) {
113 const uint32_t total_bytes = compute_total_bytes(items[i]);
114 const uint8_t num_nodes =
static_cast<uint8_t
>(items[i].size());
115 write(os, total_bytes);
116 write(os, num_nodes);
117 const std::string* data = items[i].data();
118 for (uint8_t j = 0; j < num_nodes; ++j) {
119 const uint32_t length =
static_cast<uint32_t
>(data[j].size());
121 os.write(data[j].data(), length);
124 }
catch (std::runtime_error& e) {
125 if (std::string(e.what()).find(
"size exceeds 127") != std::string::npos)
throw;
126 throw std::runtime_error(
"array_of_strings stream write failed at item " + std::to_string(i));
130template<
typename Allocator>
131void default_array_of_strings_serde<Allocator>::deserialize(
132 std::istream& is, array_of_strings* items,
unsigned num
135 bool failure =
false;
137 for (; i < num; ++i) {
139 if (!is) { failure =
true;
break; }
140 const uint8_t num_nodes = read<uint8_t>(is);
141 if (!is) { failure =
true;
break; }
142 check_num_nodes(num_nodes);
143 array_of_strings array(num_nodes,
"");
144 for (uint8_t j = 0; j < num_nodes; ++j) {
145 const uint32_t length = read<uint32_t>(is);
146 if (!is) { failure =
true;
break; }
147 std::string value(length,
'\0');
149 is.read(&value[0], length);
150 if (!is) { failure =
true;
break; }
152 array[j] = std::move(value);
155 summary_allocator alloc(summary_allocator_);
156 std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
158 }
catch (std::exception& e) {
159 summary_allocator alloc(summary_allocator_);
160 for (
unsigned j = 0; j < i; ++j) {
161 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
163 if (std::string(e.what()).find(
"size exceeds 127") != std::string::npos)
throw;
164 throw std::runtime_error(
"array_of_strings stream read failed at item " + std::to_string(i));
167 summary_allocator alloc(summary_allocator_);
168 for (
unsigned j = 0; j < i; ++j) {
169 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
171 throw std::runtime_error(
"array_of_strings stream read failed at item " + std::to_string(i));
175template<
typename Allocator>
176size_t default_array_of_strings_serde<Allocator>::serialize(
177 void* ptr,
size_t capacity,
const array_of_strings* items,
unsigned num
179 uint8_t* ptr8 =
static_cast<uint8_t*
>(ptr);
180 size_t bytes_written = 0;
182 for (
unsigned i = 0; i < num; ++i) {
183 const uint32_t total_bytes = compute_total_bytes(items[i]);
184 const uint8_t num_nodes =
static_cast<uint8_t
>(items[i].size());
185 check_memory_size(bytes_written + total_bytes, capacity);
186 bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written);
187 bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
188 const std::string* data = items[i].data();
189 for (uint8_t j = 0; j < num_nodes; ++j) {
190 const uint32_t length =
static_cast<uint32_t
>(data[j].size());
192 bytes_written += copy_to_mem(length, ptr8 + bytes_written);
193 bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length);
196 return bytes_written;
199template<
typename Allocator>
200size_t default_array_of_strings_serde<Allocator>::deserialize(
201 const void* ptr,
size_t capacity, array_of_strings* items,
unsigned num
203 const uint8_t* ptr8 =
static_cast<const uint8_t*
>(ptr);
204 size_t bytes_read = 0;
209 for (; i < num; ++i) {
210 check_memory_size(bytes_read +
sizeof(uint32_t), capacity);
211 const size_t item_start = bytes_read;
212 uint32_t total_bytes;
213 bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
214 check_memory_size(item_start + total_bytes, capacity);
216 check_memory_size(bytes_read +
sizeof(uint8_t), capacity);
218 bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
219 check_num_nodes(num_nodes);
221 array_of_strings array(num_nodes,
"");
222 for (uint8_t j = 0; j < num_nodes; ++j) {
223 check_memory_size(bytes_read +
sizeof(uint32_t), capacity);
225 bytes_read += copy_from_mem(ptr8 + bytes_read, length);
227 check_memory_size(bytes_read + length, capacity);
228 std::string value(length,
'\0');
230 bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
232 array[j] = std::move(value);
234 summary_allocator alloc(summary_allocator_);
235 std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
237 }
catch (std::exception& e) {
238 summary_allocator alloc(summary_allocator_);
239 for (
unsigned j = 0; j < i; ++j) {
240 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
242 if (std::string(e.what()).find(
"size exceeds 127") != std::string::npos)
throw;
243 throw std::runtime_error(
"array_of_strings bytes read failed at item " + std::to_string(i));
248template<
typename Allocator>
249size_t default_array_of_strings_serde<Allocator>::size_of_item(
const array_of_strings& item)
const {
250 return compute_total_bytes(item);
253template<
typename Allocator>
254void default_array_of_strings_serde<Allocator>::check_num_nodes(uint8_t num_nodes) {
255 if (num_nodes > 127) {
256 throw std::runtime_error(
"array_of_strings size exceeds 127");
260template<
typename Allocator>
261uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(
const array_of_strings& item) {
262 const auto count = item.size();
263 check_num_nodes(
static_cast<uint8_t
>(count));
264 size_t total =
sizeof(uint32_t) +
sizeof(uint8_t) + count *
sizeof(uint32_t);
265 const std::string* data = item.data();
266 for (uint32_t j = 0; j < count; ++j) {
267 total += data[j].size();
269 return static_cast<uint32_t
>(total);
XXHash (64 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/.
Definition xxhash64.h:25
uint64_t hash() const
get current hash
Definition xxhash64.h:100
bool add(const void *input, uint64_t length)
add a chunk of bytes
Definition xxhash64.h:43
Extended class of compact_tuple_sketch for array of strings.
Definition array_of_strings_sketch.hpp:94
static compact_array_of_strings_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
compact_array_of_strings_tuple_sketch(const Sketch &sketch, bool ordered=true)
Copy constructor.
Definition array_of_strings_sketch_impl.hpp:75
Compact Tuple sketch.
Definition tuple_sketch.hpp:457
Update Tuple sketch.
Definition tuple_sketch.hpp:222
DataSketches namespace.
Definition binomial_bounds.hpp:38
compact_array_of_strings_tuple_sketch< Allocator > compact_array_of_strings_sketch(const update_array_of_strings_tuple_sketch< Allocator, Policy > &sketch, bool ordered=true)
Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
Definition array_of_strings_sketch_impl.hpp:67
uint64_t hash_array_of_strings_key(const array_of_strings &key)
Hashes an array of strings using ArrayOfStrings-compatible hashing.
Definition array_of_strings_sketch_impl.hpp:53