23 #include "binomial_bounds.hpp"
24 #include "theta_helpers.hpp"
28 template<
typename S,
typename A>
33 template<
typename S,
typename A>
35 return static_cast<double>(get_theta64()) /
39 template<
typename S,
typename A>
41 return get_num_retained() / get_theta();
44 template<
typename S,
typename A>
46 num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
47 if (!is_estimation_mode())
return num_subset_entries;
48 return binomial_bounds::get_lower_bound(num_subset_entries, get_theta(), num_std_devs);
51 template<
typename S,
typename A>
53 return get_lower_bound(num_std_devs, get_num_retained()) ;
56 template<
typename S,
typename A>
58 num_subset_entries = std::min(num_subset_entries, get_num_retained()) ;
59 if (!is_estimation_mode())
return num_subset_entries;
60 return binomial_bounds::get_upper_bound(num_subset_entries, get_theta(), num_std_devs);
63 template<
typename S,
typename A>
65 return get_upper_bound(num_std_devs, get_num_retained()) ;
68 template<
typename S,
typename A>
72 std::ostringstream os;
73 os <<
"### Tuple sketch summary:" << std::endl;
74 os <<
" num retained entries : " << get_num_retained() << std::endl;
75 os <<
" seed hash : " << get_seed_hash() << std::endl;
76 os <<
" empty? : " << (is_empty() ?
"true" :
"false") << std::endl;
77 os <<
" ordered? : " << (is_ordered() ?
"true" :
"false") << std::endl;
78 os <<
" estimation mode? : " << (is_estimation_mode() ?
"true" :
"false") << std::endl;
79 os <<
" theta (fraction) : " << get_theta() << std::endl;
80 os <<
" theta (raw 64-bit) : " << get_theta64() << std::endl;
81 os <<
" estimate : " << this->get_estimate() << std::endl;
82 os <<
" lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
83 os <<
" upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
85 os <<
"### End sketch summary" << std::endl;
87 os <<
"### Retained entries" << std::endl;
88 for (
const auto& it: *
this) {
89 os << it.first <<
": " << it.second << std::endl;
91 os <<
"### End retained entries" << std::endl;
93 return string<A>(os.str().c_str(), get_allocator());
98 template<
typename S,
typename U,
typename P,
typename A>
99 update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
float p, uint64_t theta, uint64_t seed,
const P& policy,
const A& allocator):
101 map_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
104 template<
typename S,
typename U,
typename P,
typename A>
106 return map_.allocator_;
109 template<
typename S,
typename U,
typename P,
typename A>
111 return map_.is_empty_;
114 template<
typename S,
typename U,
typename P,
typename A>
116 return map_.num_entries_ > 1 ? false :
true;;
119 template<
typename S,
typename U,
typename P,
typename A>
121 return is_empty() ? theta_constants::MAX_THETA : map_.theta_;
124 template<
typename S,
typename U,
typename P,
typename A>
126 return map_.num_entries_;
129 template<
typename S,
typename U,
typename P,
typename A>
131 return compute_seed_hash(map_.seed_);
134 template<
typename S,
typename U,
typename P,
typename A>
136 return map_.lg_nom_size_;
139 template<
typename S,
typename U,
typename P,
typename A>
144 template<
typename S,
typename U,
typename P,
typename A>
145 template<
typename UU>
147 update(&key,
sizeof(key), std::forward<UU>(value));
150 template<
typename S,
typename U,
typename P,
typename A>
151 template<
typename UU>
152 void update_tuple_sketch<S, U, P, A>::update(int64_t key, UU&& value) {
153 update(&key,
sizeof(key), std::forward<UU>(value));
156 template<
typename S,
typename U,
typename P,
typename A>
157 template<
typename UU>
159 update(
static_cast<int32_t
>(key), std::forward<UU>(value));
162 template<
typename S,
typename U,
typename P,
typename A>
163 template<
typename UU>
165 update(
static_cast<int64_t
>(key), std::forward<UU>(value));
168 template<
typename S,
typename U,
typename P,
typename A>
169 template<
typename UU>
170 void update_tuple_sketch<S, U, P, A>::update(uint16_t key, UU&& value) {
171 update(
static_cast<int16_t
>(key), std::forward<UU>(value));
174 template<
typename S,
typename U,
typename P,
typename A>
175 template<
typename UU>
176 void update_tuple_sketch<S, U, P, A>::update(int16_t key, UU&& value) {
177 update(
static_cast<int64_t
>(key), std::forward<UU>(value));
180 template<
typename S,
typename U,
typename P,
typename A>
181 template<
typename UU>
182 void update_tuple_sketch<S, U, P, A>::update(uint8_t key, UU&& value) {
183 update(
static_cast<int8_t
>(key), std::forward<UU>(value));
186 template<
typename S,
typename U,
typename P,
typename A>
187 template<
typename UU>
188 void update_tuple_sketch<S, U, P, A>::update(int8_t key, UU&& value) {
189 update(
static_cast<int64_t
>(key), std::forward<UU>(value));
192 template<
typename S,
typename U,
typename P,
typename A>
193 template<
typename UU>
194 void update_tuple_sketch<S, U, P, A>::update(
const std::string& key, UU&& value) {
195 if (key.empty())
return;
196 update(key.c_str(), key.length(), std::forward<UU>(value));
199 template<
typename S,
typename U,
typename P,
typename A>
200 template<
typename UU>
201 void update_tuple_sketch<S, U, P, A>::update(
double key, UU&& value) {
202 update(canonical_double(key), std::forward<UU>(value));
205 template<
typename S,
typename U,
typename P,
typename A>
206 template<
typename UU>
207 void update_tuple_sketch<S, U, P, A>::update(
float key, UU&& value) {
208 update(
static_cast<double>(key), std::forward<UU>(value));
211 template<
typename S,
typename U,
typename P,
typename A>
212 template<
typename UU>
213 void update_tuple_sketch<S, U, P, A>::update(
const void* key,
size_t length, UU&& value) {
214 const uint64_t hash = map_.hash_and_screen(key, length);
215 if (hash == 0)
return;
216 auto result = map_.find(hash);
217 if (!result.second) {
218 S summary = policy_.create();
219 policy_.update(summary, std::forward<UU>(value));
220 map_.insert(result.first, Entry(hash, std::move(summary)));
222 policy_.update((*result.first).second, std::forward<UU>(value));
226 template<
typename S,
typename U,
typename P,
typename A>
231 template<
typename S,
typename U,
typename P,
typename A>
236 template<
typename S,
typename U,
typename P,
typename A>
238 return iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
241 template<
typename S,
typename U,
typename P,
typename A>
243 return iterator(
nullptr, 0, 1 << map_.lg_cur_size_);
246 template<
typename S,
typename U,
typename P,
typename A>
248 return const_iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
251 template<
typename S,
typename U,
typename P,
typename A>
253 return const_iterator(
nullptr, 0, 1 << map_.lg_cur_size_);
256 template<
typename S,
typename U,
typename P,
typename A>
261 template<
typename S,
typename U,
typename P,
typename A>
263 os <<
" lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
264 os <<
" lg current size : " << (
int) map_.lg_cur_size_ << std::endl;
265 os <<
" resize factor : " << (1 << map_.rf_) << std::endl;
270 template<
typename S,
typename A>
271 compact_tuple_sketch<S, A>::compact_tuple_sketch(
bool is_empty,
bool is_ordered, uint16_t seed_hash, uint64_t theta,
272 std::vector<Entry, AllocEntry>&& entries):
274 is_ordered_(is_ordered || (entries.size() <= 1ULL)),
275 seed_hash_(seed_hash),
277 entries_(std::move(entries))
280 template<
typename S,
typename A>
282 is_empty_(other.is_empty()),
283 is_ordered_(other.is_ordered() || ordered),
284 seed_hash_(other.get_seed_hash()),
285 theta_(other.get_theta64()),
286 entries_(other.get_allocator())
289 std::copy(other.
begin(), other.
end(), std::back_inserter(entries_));
290 if (ordered && !other.
is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator());
293 template<
typename S,
typename A>
295 is_empty_(other.is_empty()),
296 is_ordered_(other.is_ordered()),
297 seed_hash_(other.get_seed_hash()),
298 theta_(other.get_theta64()),
299 entries_(std::move(other.entries_))
302 template<
typename S,
typename A>
304 is_empty_(other.is_empty()),
305 is_ordered_(other.is_ordered() || ordered),
306 seed_hash_(other.get_seed_hash()),
307 theta_(other.get_theta64()),
308 entries_(other.get_allocator())
311 for (uint64_t hash: other) {
312 entries_.push_back(Entry(hash, summary));
314 if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator());
317 template<
typename S,
typename A>
319 return entries_.get_allocator();
322 template<
typename S,
typename A>
327 template<
typename S,
typename A>
332 template<
typename S,
typename A>
337 template<
typename S,
typename A>
339 return static_cast<uint32_t
>(entries_.size());
342 template<
typename S,
typename A>
348 template<
typename S,
typename A>
349 template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value,
int>::type>
352 return entries_.size() *
sizeof(SS);
356 template<
typename S,
typename A>
357 template<typename SD, typename SS, typename std::enable_if<!std::is_arithmetic<SS>::value,
int>::type>
360 for (
const auto& it: entries_) {
361 size += sd.size_of_item(it.second);
366 template<
typename S,
typename A>
367 template<
typename SerDe>
369 const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
370 write(os, preamble_longs);
371 const uint8_t serial_version = SERIAL_VERSION;
372 write(os, serial_version);
373 const uint8_t family = SKETCH_FAMILY;
375 const uint8_t type = SKETCH_TYPE;
377 const uint8_t unused8 = 0;
379 const uint8_t flags_byte(
380 (1 << flags::IS_COMPACT) |
381 (1 << flags::IS_READ_ONLY) |
382 (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
383 (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
385 write(os, flags_byte);
386 const uint16_t seed_hash = get_seed_hash();
387 write(os, seed_hash);
388 if (preamble_longs > 1) {
389 const uint32_t num_entries =
static_cast<uint32_t
>(entries_.size());
390 write(os, num_entries);
391 const uint32_t unused32 = 0;
394 if (this->is_estimation_mode()) {
395 write(os, this->theta_);
397 for (
const auto& it: entries_) {
399 sd.serialize(os, &it.second, 1);
403 template<
typename S,
typename A>
404 template<
typename SerDe>
406 const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
407 const size_t size = header_size_bytes +
sizeof(uint64_t) * preamble_longs
408 +
sizeof(uint64_t) * entries_.size() + get_serialized_size_summaries_bytes(sd);
409 vector_bytes bytes(size, 0, entries_.get_allocator());
410 uint8_t* ptr = bytes.data() + header_size_bytes;
411 const uint8_t* end_ptr = ptr + size;
413 ptr += copy_to_mem(preamble_longs, ptr);
414 const uint8_t serial_version = SERIAL_VERSION;
415 ptr += copy_to_mem(serial_version, ptr);
416 const uint8_t family = SKETCH_FAMILY;
417 ptr += copy_to_mem(family, ptr);
418 const uint8_t type = SKETCH_TYPE;
419 ptr += copy_to_mem(type, ptr);
420 ptr +=
sizeof(uint8_t);
421 const uint8_t flags_byte(
422 (1 << flags::IS_COMPACT) |
423 (1 << flags::IS_READ_ONLY) |
424 (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
425 (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
427 ptr += copy_to_mem(flags_byte, ptr);
428 const uint16_t seed_hash = get_seed_hash();
429 ptr += copy_to_mem(seed_hash, ptr);
430 if (preamble_longs > 1) {
431 const uint32_t num_entries =
static_cast<uint32_t
>(entries_.size());
432 ptr += copy_to_mem(num_entries, ptr);
433 ptr +=
sizeof(uint32_t);
435 if (this->is_estimation_mode()) {
436 ptr += copy_to_mem(theta_, ptr);
438 for (
const auto& it: entries_) {
439 ptr += copy_to_mem(it.first, ptr);
440 ptr += sd.serialize(ptr, end_ptr - ptr, &it.second, 1);
445 template<
typename S,
typename A>
446 template<
typename SerDe>
448 const auto preamble_longs = read<uint8_t>(is);
449 const auto serial_version = read<uint8_t>(is);
450 const auto family = read<uint8_t>(is);
451 const auto type = read<uint8_t>(is);
453 const auto flags_byte = read<uint8_t>(is);
454 const auto seed_hash = read<uint16_t>(is);
455 if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
456 throw std::invalid_argument(
"serial version mismatch: expected " + std::to_string(SERIAL_VERSION) +
" or "
457 + std::to_string(SERIAL_VERSION_LEGACY) +
", actual " + std::to_string(serial_version));
459 checker<true>::check_sketch_family(family, SKETCH_FAMILY);
460 if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
461 throw std::invalid_argument(
"sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) +
" or "
462 + std::to_string(SKETCH_TYPE_LEGACY) +
", actual " + std::to_string(type));
464 const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
465 if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
468 uint32_t num_entries = 0;
470 if (preamble_longs == 1) {
473 num_entries = read<uint32_t>(is);
475 if (preamble_longs > 2) {
476 theta = read<uint64_t>(is);
481 std::vector<Entry, AllocEntry> entries(alloc);
483 entries.reserve(num_entries);
484 std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1,
false, allocator));
485 for (
size_t i = 0; i < num_entries; ++i) {
486 const auto key = read<uint64_t>(is);
487 sd.deserialize(is, summary.get(), 1);
488 entries.push_back(Entry(key, std::move(*summary)));
492 if (!is.good())
throw std::runtime_error(
"error reading from std::istream");
493 const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
494 return compact_tuple_sketch(is_empty, is_ordered, seed_hash, theta, std::move(entries));
497 template<
typename S,
typename A>
498 template<
typename SerDe>
500 ensure_minimum_memory(size, 8);
501 const char* ptr =
static_cast<const char*
>(bytes);
502 const char* base = ptr;
503 uint8_t preamble_longs;
504 ptr += copy_from_mem(ptr, preamble_longs);
505 uint8_t serial_version;
506 ptr += copy_from_mem(ptr, serial_version);
508 ptr += copy_from_mem(ptr, family);
510 ptr += copy_from_mem(ptr, type);
511 ptr +=
sizeof(uint8_t);
513 ptr += copy_from_mem(ptr, flags_byte);
515 ptr += copy_from_mem(ptr, seed_hash);
516 if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
517 throw std::invalid_argument(
"serial version mismatch: expected " + std::to_string(SERIAL_VERSION) +
" or "
518 + std::to_string(SERIAL_VERSION_LEGACY) +
", actual " + std::to_string(serial_version));
520 checker<true>::check_sketch_family(family, SKETCH_FAMILY);
521 if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
522 throw std::invalid_argument(
"sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) +
" or "
523 + std::to_string(SKETCH_TYPE_LEGACY) +
", actual " + std::to_string(type));
525 const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
526 if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
529 uint32_t num_entries = 0;
532 if (preamble_longs == 1) {
535 ensure_minimum_memory(size, 8);
536 ptr += copy_from_mem(ptr, num_entries);
537 ptr +=
sizeof(uint32_t);
538 if (preamble_longs > 2) {
539 ensure_minimum_memory(size, (preamble_longs - 1) << 3);
540 ptr += copy_from_mem(ptr, theta);
544 const size_t keys_size_bytes =
sizeof(uint64_t) * num_entries;
545 ensure_minimum_memory(size, ptr - base + keys_size_bytes);
547 std::vector<Entry, AllocEntry> entries(alloc);
549 entries.reserve(num_entries);
550 std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1,
false, allocator));
551 for (
size_t i = 0; i < num_entries; ++i) {
553 ptr += copy_from_mem(ptr, key);
554 ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1);
555 entries.push_back(Entry(key, std::move(*summary)));
559 const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
560 return compact_tuple_sketch(is_empty, is_ordered, seed_hash, theta, std::move(entries));
563 template<
typename S,
typename A>
565 return iterator(entries_.data(),
static_cast<uint32_t
>(entries_.size()), 0);
568 template<
typename S,
typename A>
570 return iterator(
nullptr, 0,
static_cast<uint32_t
>(entries_.size()));
573 template<
typename S,
typename A>
575 return const_iterator(entries_.data(),
static_cast<uint32_t
>(entries_.size()), 0);
578 template<
typename S,
typename A>
580 return const_iterator(
nullptr, 0,
static_cast<uint32_t
>(entries_.size()));
583 template<
typename S,
typename A>
588 template<
typename D,
typename P,
typename A>
589 tuple_base_builder<D, P, A>::tuple_base_builder(
const P& policy,
const A& allocator):
590 theta_base_builder<D, A>(allocator), policy_(policy) {}
592 template<
typename S,
typename U,
typename P,
typename A>
596 template<
typename S,
typename U,
typename P,
typename A>
598 return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
virtual uint32_t get_num_retained() const =0
Compact Tuple sketch.
Definition: tuple_sketch.hpp:407
virtual uint64_t get_theta64() const
Definition: tuple_sketch_impl.hpp:333
virtual uint32_t get_num_retained() const
Definition: tuple_sketch_impl.hpp:338
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: tuple_sketch_impl.hpp:368
virtual bool is_empty() const
Definition: tuple_sketch_impl.hpp:323
virtual bool is_ordered() const
Definition: tuple_sketch_impl.hpp:328
virtual uint16_t get_seed_hash() const
Definition: tuple_sketch_impl.hpp:343
virtual iterator end()
Iterator pointing past the valid range.
Definition: tuple_sketch_impl.hpp:569
compact_tuple_sketch(const Base &other, bool ordered)
Copy constructor.
Definition: tuple_sketch_impl.hpp:281
virtual Allocator get_allocator() const
Definition: tuple_sketch_impl.hpp:318
static compact_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual iterator begin()
Iterator over entries in this sketch.
Definition: tuple_sketch_impl.hpp:564
size_t get_serialized_size_summaries_bytes(const SerDe &sd) const
Computes size needed to serialize summaries in the sketch.
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition: theta_sketch.hpp:127
Tuple base builder.
Definition: tuple_sketch.hpp:587
Base class for Tuple sketch.
Definition: tuple_sketch.hpp:54
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate upper error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:57
double get_estimate() const
Definition: tuple_sketch_impl.hpp:40
virtual bool is_ordered() const =0
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate lower error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:45
string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition: tuple_sketch_impl.hpp:69
virtual uint32_t get_num_retained() const =0
double get_theta() const
Definition: tuple_sketch_impl.hpp:34
virtual iterator end()=0
Iterator pointing past the valid range.
virtual iterator begin()=0
Iterator over entries in this sketch.
bool is_estimation_mode() const
Definition: tuple_sketch_impl.hpp:29
Update Tuple sketch builder.
Definition: tuple_sketch.hpp:597
builder(const P &policy=P(), const A &allocator=A())
Constructor Creates and instance of the builder with default parameters.
Definition: tuple_sketch_impl.hpp:593
update_tuple_sketch< S, U, P, A > build() const
This is to create an instance of the sketch with predefined parameters.
Definition: tuple_sketch_impl.hpp:597
Update Tuple sketch.
Definition: tuple_sketch.hpp:217
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition: tuple_sketch_impl.hpp:227
void reset()
Reset the sketch to the initial empty state.
Definition: tuple_sketch_impl.hpp:232
const uint64_t MAX_THETA
max theta - signed max for compatibility with Java
Definition: theta_constants.hpp:36
DataSketches namespace.
Definition: binomial_bounds.hpp:38