datasketches-cpp
Loading...
Searching...
No Matches
array_of_strings_sketch_impl.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
21#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
22
23#include <stdexcept>
24
25#include "common_defs.hpp"
26
27namespace datasketches {
28
29inline array_of_strings default_array_of_strings_update_policy::create() const {
30 return array_of_strings(0, "");
31}
32
33inline void default_array_of_strings_update_policy::update(
34 array_of_strings& array, const array_of_strings& input
35) const {
36 const auto length = static_cast<size_t>(input.size());
37 array = array_of_strings(static_cast<uint8_t>(length), "");
38 for (size_t i = 0; i < length; ++i) array[i] = input[i];
39}
40
41inline void default_array_of_strings_update_policy::update(
42 array_of_strings& array, const array_of_strings* input
43) const {
44 if (input == nullptr) {
45 array = array_of_strings(0, "");
46 return;
47 }
48 const auto length = static_cast<size_t>(input->size());
49 array = array_of_strings(static_cast<uint8_t>(length), "");
50 for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
51}
52
53inline uint64_t hash_array_of_strings_key(const array_of_strings& key) {
54 // Matches Java Util.PRIME for ArrayOfStrings key hashing.
55 static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
56 XXHash64 hasher(STRING_ARR_HASH_SEED);
57 const auto size = static_cast<size_t>(key.size());
58 for (size_t i = 0; i < size; ++i) {
59 const auto& entry = key[i];
60 hasher.add(entry.data(), entry.size());
61 if (i + 1 < size) hasher.add(",", 1);
62 }
63 return hasher.hash();
64}
65
66template<typename Allocator, typename Policy>
72
73template<typename Allocator>
74template<typename Sketch>
76 const Sketch& sketch, bool ordered
77): Base(sketch, ordered) {}
78
79template<typename Allocator>
81 Base&& base
82): Base(std::move(base)) {}
83
84template<typename Allocator>
85template<typename SerDe>
87 std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator
88) -> compact_array_of_strings_tuple_sketch {
89 auto base = Base::deserialize(is, seed, sd, allocator);
90 return compact_array_of_strings_tuple_sketch(std::move(base));
91}
92
93template<typename Allocator>
94template<typename SerDe>
96 const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator
97) -> compact_array_of_strings_tuple_sketch {
98 auto base = Base::deserialize(bytes, size, seed, sd, allocator);
99 return compact_array_of_strings_tuple_sketch(std::move(base));
100}
101
102template<typename Allocator>
103default_array_of_strings_serde<Allocator>::default_array_of_strings_serde(const Allocator& allocator):
104 summary_allocator_(allocator) {}
105
106template<typename Allocator>
107void default_array_of_strings_serde<Allocator>::serialize(
108 std::ostream& os, const array_of_strings* items, unsigned num
109) const {
110 unsigned i = 0;
111 try {
112 for (; i < num; ++i) {
113 const uint32_t total_bytes = compute_total_bytes(items[i]);
114 const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
115 write(os, total_bytes);
116 write(os, num_nodes);
117 const std::string* data = items[i].data();
118 for (uint8_t j = 0; j < num_nodes; ++j) {
119 const uint32_t length = static_cast<uint32_t>(data[j].size());
120 write(os, length);
121 os.write(data[j].data(), length);
122 }
123 }
124 } catch (std::runtime_error& e) {
125 if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
126 throw std::runtime_error("array_of_strings stream write failed at item " + std::to_string(i));
127 }
128}
129
130template<typename Allocator>
131void default_array_of_strings_serde<Allocator>::deserialize(
132 std::istream& is, array_of_strings* items, unsigned num
133) const {
134 unsigned i = 0;
135 bool failure = false;
136 try {
137 for (; i < num; ++i) {
138 read<uint32_t>(is); // total_bytes
139 if (!is) { failure = true; break; }
140 const uint8_t num_nodes = read<uint8_t>(is);
141 if (!is) { failure = true; break; }
142 check_num_nodes(num_nodes);
143 array_of_strings array(num_nodes, "");
144 for (uint8_t j = 0; j < num_nodes; ++j) {
145 const uint32_t length = read<uint32_t>(is);
146 if (!is) { failure = true; break; }
147 std::string value(length, '\0');
148 if (length != 0) {
149 is.read(&value[0], length);
150 if (!is) { failure = true; break; }
151 }
152 array[j] = std::move(value);
153 }
154 if (failure) break;
155 summary_allocator alloc(summary_allocator_);
156 std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
157 }
158 } catch (std::exception& e) {
159 summary_allocator alloc(summary_allocator_);
160 for (unsigned j = 0; j < i; ++j) {
161 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
162 }
163 if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
164 throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i));
165 }
166 if (failure) {
167 summary_allocator alloc(summary_allocator_);
168 for (unsigned j = 0; j < i; ++j) {
169 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
170 }
171 throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i));
172 }
173}
174
175template<typename Allocator>
176size_t default_array_of_strings_serde<Allocator>::serialize(
177 void* ptr, size_t capacity, const array_of_strings* items, unsigned num
178) const {
179 uint8_t* ptr8 = static_cast<uint8_t*>(ptr);
180 size_t bytes_written = 0;
181
182 for (unsigned i = 0; i < num; ++i) {
183 const uint32_t total_bytes = compute_total_bytes(items[i]);
184 const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
185 check_memory_size(bytes_written + total_bytes, capacity);
186 bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written);
187 bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
188 const std::string* data = items[i].data();
189 for (uint8_t j = 0; j < num_nodes; ++j) {
190 const uint32_t length = static_cast<uint32_t>(data[j].size());
191
192 bytes_written += copy_to_mem(length, ptr8 + bytes_written);
193 bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length);
194 }
195 }
196 return bytes_written;
197}
198
199template<typename Allocator>
200size_t default_array_of_strings_serde<Allocator>::deserialize(
201 const void* ptr, size_t capacity, array_of_strings* items, unsigned num
202) const {
203 const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
204 size_t bytes_read = 0;
205
206 unsigned i = 0;
207
208 try {
209 for (; i < num; ++i) {
210 check_memory_size(bytes_read + sizeof(uint32_t), capacity);
211 const size_t item_start = bytes_read;
212 uint32_t total_bytes;
213 bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
214 check_memory_size(item_start + total_bytes, capacity);
215
216 check_memory_size(bytes_read + sizeof(uint8_t), capacity);
217 uint8_t num_nodes;
218 bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
219 check_num_nodes(num_nodes);
220
221 array_of_strings array(num_nodes, "");
222 for (uint8_t j = 0; j < num_nodes; ++j) {
223 check_memory_size(bytes_read + sizeof(uint32_t), capacity);
224 uint32_t length;
225 bytes_read += copy_from_mem(ptr8 + bytes_read, length);
226
227 check_memory_size(bytes_read + length, capacity);
228 std::string value(length, '\0');
229 if (length != 0) {
230 bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
231 }
232 array[j] = std::move(value);
233 }
234 summary_allocator alloc(summary_allocator_);
235 std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
236 }
237 } catch (std::exception& e) {
238 summary_allocator alloc(summary_allocator_);
239 for (unsigned j = 0; j < i; ++j) {
240 std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
241 }
242 if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
243 throw std::runtime_error("array_of_strings bytes read failed at item " + std::to_string(i));
244 }
245 return bytes_read;
246}
247
248template<typename Allocator>
249size_t default_array_of_strings_serde<Allocator>::size_of_item(const array_of_strings& item) const {
250 return compute_total_bytes(item);
251}
252
253template<typename Allocator>
254void default_array_of_strings_serde<Allocator>::check_num_nodes(uint8_t num_nodes) {
255 if (num_nodes > 127) {
256 throw std::runtime_error("array_of_strings size exceeds 127");
257 }
258}
259
260template<typename Allocator>
261uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const array_of_strings& item) {
262 const auto count = item.size();
263 check_num_nodes(static_cast<uint8_t>(count));
264 size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t);
265 const std::string* data = item.data();
266 for (uint32_t j = 0; j < count; ++j) {
267 total += data[j].size();
268 }
269 return static_cast<uint32_t>(total);
270}
271
272} /* namespace datasketches */
273
274#endif
XXHash (64 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/.
Definition xxhash64.h:25
uint64_t hash() const
get current hash
Definition xxhash64.h:100
bool add(const void *input, uint64_t length)
add a chunk of bytes
Definition xxhash64.h:43
Extended class of compact_tuple_sketch for array of strings.
Definition array_of_strings_sketch.hpp:94
static compact_array_of_strings_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
compact_array_of_strings_tuple_sketch(const Sketch &sketch, bool ordered=true)
Copy constructor.
Definition array_of_strings_sketch_impl.hpp:75
Compact Tuple sketch.
Definition tuple_sketch.hpp:457
Update Tuple sketch.
Definition tuple_sketch.hpp:222
DataSketches namespace.
Definition binomial_bounds.hpp:38
compact_array_of_strings_tuple_sketch< Allocator > compact_array_of_strings_sketch(const update_array_of_strings_tuple_sketch< Allocator, Policy > &sketch, bool ordered=true)
Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
Definition array_of_strings_sketch_impl.hpp:67
uint64_t hash_array_of_strings_key(const array_of_strings &key)
Hashes an array of strings using ArrayOfStrings-compatible hashing.
Definition array_of_strings_sketch_impl.hpp:53