datasketches-cpp
Loading...
Searching...
No Matches
theta_sketch_impl.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef THETA_SKETCH_IMPL_HPP_
21#define THETA_SKETCH_IMPL_HPP_
22
23#include <sstream>
24#include <vector>
25#include <stdexcept>
26
27#include "binomial_bounds.hpp"
28#include "theta_helpers.hpp"
29#include "count_zeros.hpp"
30#include "bit_packing.hpp"
31#include "memory_operations.hpp"
32
33namespace datasketches {
34
35template<typename A>
37 return get_theta64() < theta_constants::MAX_THETA && !is_empty();
38}
39
40template<typename A>
42 return static_cast<double>(get_theta64()) /
43 static_cast<double>(theta_constants::MAX_THETA);
44}
45
46template<typename A>
48 return get_num_retained() / get_theta();
49}
50
51template<typename A>
52double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
53 if (!is_estimation_mode()) return get_num_retained();
54 return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
55}
56
57template<typename A>
58double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
59 if (!is_estimation_mode()) return get_num_retained();
60 return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
61}
62
63template<typename A>
64string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
65 // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
66 // The stream does not support passing an allocator instance, and alternatives are complicated.
67 std::ostringstream os;
68 os << "### Theta sketch summary:" << std::endl;
69 os << " num retained entries : " << this->get_num_retained() << std::endl;
70 os << " seed hash : " << this->get_seed_hash() << std::endl;
71 os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
72 os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
73 os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
74 os << " theta (fraction) : " << this->get_theta() << std::endl;
75 os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
76 os << " estimate : " << this->get_estimate() << std::endl;
77 os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
78 os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
79 print_specifics(os);
80 os << "### End sketch summary" << std::endl;
81 if (print_details) {
82 print_items(os);
83 }
84 return string<A>(os.str().c_str(), this->get_allocator());
85}
87template<typename A>
88void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
89 os << "### Retained entries" << std::endl;
90 for (const auto& hash: *this) {
91 os << hash << std::endl;
92 }
93 os << "### End retained entries" << std::endl;
94}
95
96
97// update sketch
98
99template<typename A>
100update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
101 float p, uint64_t theta, uint64_t seed, const A& allocator):
102table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
103{}
104
105template<typename A>
107 return table_.allocator_;
108}
109
110template<typename A>
112 return table_.is_empty_;
113}
114
115template<typename A>
117 return table_.num_entries_ > 1 ? false : true;
119
120template<typename A>
122 return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
123}
124
125template<typename A>
127 return table_.num_entries_;
128}
129
130template<typename A>
132 return compute_seed_hash(table_.seed_);
133}
134
135template<typename A>
137 return table_.lg_nom_size_;
138}
139
140template<typename A>
141auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
142 return table_.rf_;
143}
144
145template<typename A>
147 update(&value, sizeof(value));
148}
149
150template<typename A>
152 update(&value, sizeof(value));
153}
154
155template<typename A>
157 update(static_cast<int32_t>(value));
158}
159
160template<typename A>
162 update(static_cast<int64_t>(value));
163}
164
165template<typename A>
167 update(static_cast<int16_t>(value));
168}
169
170template<typename A>
172 update(static_cast<int64_t>(value));
173}
174
175template<typename A>
177 update(static_cast<int8_t>(value));
178}
179
180template<typename A>
182 update(static_cast<int64_t>(value));
183}
184
185template<typename A>
187 update(canonical_double(value));
188}
189
190template<typename A>
192 update(static_cast<double>(value));
193}
194
195template<typename A>
196void update_theta_sketch_alloc<A>::update(const std::string& value) {
197 if (value.empty()) return;
198 update(value.c_str(), value.length());
199}
200
201template<typename A>
202void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
203 const uint64_t hash = table_.hash_and_screen(data, length);
204 if (hash == 0) return;
205 auto result = table_.find(hash);
206 if (!result.second) {
207 table_.insert(result.first, hash);
208 }
209}
210
211template<typename A>
213 table_.trim();
214}
215
216template<typename A>
218 table_.reset();
219}
220
221template<typename A>
223 return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
224}
225
226template<typename A>
228 return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
229}
230
231template<typename A>
232auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
233 return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
234}
235
236template<typename A>
237auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
238 return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
239}
240
241template<typename A>
245
246template<typename A>
247void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
248 os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
249 os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
250 os << " resize factor : " << (1 << table_.rf_) << std::endl;
251}
252
253// builder
254
255template<typename A>
257
258template<typename A>
260 return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
261}
262
263// compact sketch
264
265template<typename A>
266template<typename Other>
268is_empty_(other.is_empty()),
269is_ordered_(other.is_ordered() || ordered),
270seed_hash_(other.get_seed_hash()),
271theta_(other.get_theta64()),
272entries_(other.get_allocator())
273{
274 if (!other.is_empty()) {
275 entries_.reserve(other.get_num_retained());
276 std::copy(other.begin(), other.end(), std::back_inserter(entries_));
277 if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
278 }
279}
280
281template<typename A>
282compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
283 std::vector<uint64_t, A>&& entries):
284is_empty_(is_empty),
285is_ordered_(is_ordered || (entries.size() <= 1ULL)),
286seed_hash_(seed_hash),
287theta_(theta),
288entries_(std::move(entries))
289{}
290
291template<typename A>
293 return entries_.get_allocator();
294}
295
296template<typename A>
298 return is_empty_;
299}
300
301template<typename A>
303 return is_ordered_;
304}
305
306template<typename A>
308 return theta_;
309}
310
311template<typename A>
313 return static_cast<uint32_t>(entries_.size());
314}
315
316template<typename A>
318 return seed_hash_;
319}
320
321template<typename A>
323 return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
324}
325
326template<typename A>
328 return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
329}
330
331template<typename A>
332auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
333 return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
334}
335
336template<typename A>
337auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
338 return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
339}
340
341template<typename A>
342void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
343
344template<typename A>
345uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
346 if (compressed) {
347 return this->is_estimation_mode() ? 2 : 1;
348 }
349 return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
350}
351
352template<typename A>
354 return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
355}
356
357template<typename A>
359 if (compressed && is_suitable_for_compression()) {
360 return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
361 }
362 return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
363}
364
365// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
366template<typename A>
368 return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
369}
370
371template<typename A>
372size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
373 const size_t compressed_bits = entry_bits * entries_.size();
374 return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
375}
376
377template<typename A>
378void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
379 const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
380 write(os, preamble_longs);
381 write(os, UNCOMPRESSED_SERIAL_VERSION);
382 write(os, SKETCH_TYPE);
383 write<uint16_t>(os, 0); // unused
384 const uint8_t flags_byte(
385 (1 << flags::IS_COMPACT) |
386 (1 << flags::IS_READ_ONLY) |
387 (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
388 (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
389 );
390 write(os, flags_byte);
391 write(os, get_seed_hash());
392 if (preamble_longs > 1) {
393 write(os, static_cast<uint32_t>(entries_.size()));
394 write<uint32_t>(os, 0); // unused
395 }
396 if (this->is_estimation_mode()) write(os, this->theta_);
397 if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
398}
399
400template<typename A>
401auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
402 const size_t size = get_serialized_size_bytes() + header_size_bytes;
403 vector_bytes bytes(size, 0, entries_.get_allocator());
404 uint8_t* ptr = bytes.data() + header_size_bytes;
405 const uint8_t preamble_longs = get_preamble_longs(false);
406 *ptr++ = preamble_longs;
407 *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
408 *ptr++ = SKETCH_TYPE;
409 ptr += sizeof(uint16_t); // unused
410 const uint8_t flags_byte(
411 (1 << flags::IS_COMPACT) |
412 (1 << flags::IS_READ_ONLY) |
413 (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
414 (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
415 );
416 *ptr++ = flags_byte;
417 ptr += copy_to_mem(get_seed_hash(), ptr);
418 if (preamble_longs > 1) {
419 ptr += copy_to_mem(static_cast<uint32_t>(entries_.size()), ptr);
420 ptr += sizeof(uint32_t); // unused
421 }
422 if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
423 if (entries_.size() > 0) ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
424 return bytes;
425}
426
427template<typename A>
429 if (!this->is_ordered() || entries_.size() == 0 ||
430 (entries_.size() == 1 && !this->is_estimation_mode())) return false;
431 return true;
432}
433
434template<typename A>
436 if (is_suitable_for_compression()) return serialize_version_4(os);
437 return serialize(os);
438}
439
440template<typename A>
441auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
442 if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
443 return serialize(header_size_bytes);
444}
445
446template<typename A>
448 // compression is based on leading zeros in deltas between ordered hash values
449 // assumes ordered sketch
450 uint64_t previous = 0;
451 uint64_t ored = 0;
452 for (const uint64_t entry: entries_) {
453 const uint64_t delta = entry - previous;
454 ored |= delta;
455 previous = entry;
456 }
457 return 64 - count_leading_zeros_in_u64(ored);
458}
459
460template<typename A>
461void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
462 const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
463 const uint8_t entry_bits = compute_entry_bits();
464 const uint8_t num_entries_bytes = get_num_entries_bytes();
465
466 write(os, preamble_longs);
467 write(os, COMPRESSED_SERIAL_VERSION);
468 write(os, SKETCH_TYPE);
469 write(os, entry_bits);
470 write(os, num_entries_bytes);
471 const uint8_t flags_byte(
472 (1 << flags::IS_COMPACT) |
473 (1 << flags::IS_READ_ONLY) |
474 (1 << flags::IS_ORDERED)
475 );
476 write(os, flags_byte);
477 write(os, get_seed_hash());
478 if (this->is_estimation_mode()) write(os, this->theta_);
479 uint32_t num_entries = static_cast<uint32_t>(entries_.size());
480 for (unsigned i = 0; i < num_entries_bytes; ++i) {
481 write<uint8_t>(os, num_entries & 0xff);
482 num_entries >>= 8;
483 }
484
485 uint64_t previous = 0;
486 uint64_t deltas[8];
487 vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
488
489 // pack blocks of 8 deltas
490 unsigned i;
491 for (i = 0; i + 7 < entries_.size(); i += 8) {
492 for (unsigned j = 0; j < 8; ++j) {
493 deltas[j] = entries_[i + j] - previous;
494 previous = entries_[i + j];
495 }
496 pack_bits_block8(deltas, buffer.data(), entry_bits);
497 write(os, buffer.data(), buffer.size());
498 }
499
500 // pack extra deltas if fewer than 8 of them left
501 if (i < entries_.size()) {
502 uint8_t offset = 0;
503 uint8_t* ptr = buffer.data();
504 for (; i < entries_.size(); ++i) {
505 const uint64_t delta = entries_[i] - previous;
506 previous = entries_[i];
507 offset = pack_bits(delta, entry_bits, ptr, offset);
508 }
509 if (offset > 0) ++ptr;
510 write(os, buffer.data(), ptr - buffer.data());
511 }
512}
513
514template<typename A>
515auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
516 const uint8_t entry_bits = compute_entry_bits();
517 const uint8_t num_entries_bytes = get_num_entries_bytes();
518 const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
519 vector_bytes bytes(size, 0, entries_.get_allocator());
520 uint8_t* ptr = bytes.data() + header_size_bytes;
521
522 *ptr++ = get_preamble_longs(true);
523 *ptr++ = COMPRESSED_SERIAL_VERSION;
524 *ptr++ = SKETCH_TYPE;
525 *ptr++ = entry_bits;
526 *ptr++ = num_entries_bytes;
527 const uint8_t flags_byte(
528 (1 << flags::IS_COMPACT) |
529 (1 << flags::IS_READ_ONLY) |
530 (1 << flags::IS_ORDERED)
531 );
532 *ptr++ = flags_byte;
533 ptr += copy_to_mem(get_seed_hash(), ptr);
534 if (this->is_estimation_mode()) {
535 ptr += copy_to_mem(theta_, ptr);
536 }
537 uint32_t num_entries = static_cast<uint32_t>(entries_.size());
538 for (unsigned i = 0; i < num_entries_bytes; ++i) {
539 *ptr++ = num_entries & 0xff;
540 num_entries >>= 8;
541 }
542
543 uint64_t previous = 0;
544 uint64_t deltas[8];
545
546 // pack blocks of 8 deltas
547 unsigned i;
548 for (i = 0; i + 7 < entries_.size(); i += 8) {
549 for (unsigned j = 0; j < 8; ++j) {
550 deltas[j] = entries_[i + j] - previous;
551 previous = entries_[i + j];
552 }
553 pack_bits_block8(deltas, ptr, entry_bits);
554 ptr += entry_bits;
555 }
556
557 // pack extra deltas if fewer than 8 of them left
558 uint8_t offset = 0;
559 for (; i < entries_.size(); ++i) {
560 const uint64_t delta = entries_[i] - previous;
561 previous = entries_[i];
562 offset = pack_bits(delta, entry_bits, ptr, offset);
563 }
564 return bytes;
565}
566
567template<typename A>
568compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
569 const auto preamble_longs = read<uint8_t>(is);
570 const auto serial_version = read<uint8_t>(is);
571 const auto type = read<uint8_t>(is);
572 checker<true>::check_sketch_type(type, SKETCH_TYPE);
573 switch (serial_version) {
574 case 4:
575 return deserialize_v4(preamble_longs, is, seed, allocator);
576 case 3:
577 return deserialize_v3(preamble_longs, is, seed, allocator);
578 case 1:
579 return deserialize_v1(preamble_longs, is, seed, allocator);
580 case 2:
581 return deserialize_v2(preamble_longs, is, seed, allocator);
582 default:
583 throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
584 }
585}
586
587template<typename A>
588compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
589 uint8_t, std::istream& is, uint64_t seed, const A& allocator)
590{
591 const auto seed_hash = compute_seed_hash(seed);
592 read<uint8_t>(is); // unused
593 read<uint32_t>(is); // unused
594 const auto num_entries = read<uint32_t>(is);
595 read<uint32_t>(is); //unused
596 const auto theta = read<uint64_t>(is);
597 std::vector<uint64_t, A> entries(num_entries, 0, allocator);
598 bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
599 if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
600 if (!is.good()) throw std::runtime_error("error reading from std::istream");
601 return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
602}
603
604template<typename A>
605compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
606 uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
607{
608 read<uint8_t>(is); // unused
609 read<uint16_t>(is); // unused
610 const uint16_t seed_hash = read<uint16_t>(is);
611 checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
612 if (preamble_longs == 1) {
613 if (!is.good()) throw std::runtime_error("error reading from std::istream");
614 std::vector<uint64_t, A> entries(0, 0, allocator);
615 return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
616 } else if (preamble_longs == 2) {
617 const uint32_t num_entries = read<uint32_t>(is);
618 read<uint32_t>(is); // unused
619 std::vector<uint64_t, A> entries(num_entries, 0, allocator);
620 if (num_entries == 0) {
621 return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
622 }
623 read(is, entries.data(), entries.size() * sizeof(uint64_t));
624 if (!is.good()) throw std::runtime_error("error reading from std::istream");
625 return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
626 } else if (preamble_longs == 3) {
627 const uint32_t num_entries = read<uint32_t>(is);
628 read<uint32_t>(is); // unused
629 const auto theta = read<uint64_t>(is);
630 bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
631 std::vector<uint64_t, A> entries(num_entries, 0, allocator);
632 if (is_empty) {
633 if (!is.good()) throw std::runtime_error("error reading from std::istream");
634 return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
635 } else {
636 read(is, entries.data(), sizeof(uint64_t) * entries.size());
637 if (!is.good()) throw std::runtime_error("error reading from std::istream");
638 return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
639 }
640 } else {
641 throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
642 }
643}
644
645template<typename A>
646compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
647 uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
648{
649 read<uint16_t>(is); // unused
650 const auto flags_byte = read<uint8_t>(is);
651 const auto seed_hash = read<uint16_t>(is);
652 const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
653 if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
654 uint64_t theta = theta_constants::MAX_THETA;
655 uint32_t num_entries = 0;
656 if (!is_empty) {
657 if (preamble_longs == 1) {
658 num_entries = 1;
659 } else {
660 num_entries = read<uint32_t>(is);
661 read<uint32_t>(is); // unused
662 if (preamble_longs > 2) theta = read<uint64_t>(is);
663 }
664 }
665 std::vector<uint64_t, A> entries(num_entries, 0, allocator);
666 if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
667 const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
668 if (!is.good()) throw std::runtime_error("error reading from std::istream");
669 return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
670}
671
672template<typename A>
673compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
674 uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
675{
676 const auto entry_bits = read<uint8_t>(is);
677 const auto num_entries_bytes = read<uint8_t>(is);
678 const auto flags_byte = read<uint8_t>(is);
679 const auto seed_hash = read<uint16_t>(is);
680 const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
681 if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
682 uint64_t theta = theta_constants::MAX_THETA;
683 if (preamble_longs > 1) theta = read<uint64_t>(is);
684 uint32_t num_entries = 0;
685 for (unsigned i = 0; i < num_entries_bytes; ++i) {
686 num_entries |= read<uint8_t>(is) << (i << 3);
687 }
688 vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
689 std::vector<uint64_t, A> entries(num_entries, 0, allocator);
690
691 // unpack blocks of 8 deltas
692 unsigned i;
693 for (i = 0; i + 7 < num_entries; i += 8) {
694 read(is, buffer.data(), buffer.size());
695 unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
696 }
697 // unpack extra deltas if fewer than 8 of them left
698 if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
699 if (!is.good()) throw std::runtime_error("error reading from std::istream");
700 const uint8_t* ptr = buffer.data();
701 uint8_t offset = 0;
702 for (; i < num_entries; ++i) {
703 offset = unpack_bits(entries[i], entry_bits, ptr, offset);
704 }
705 // undo deltas
706 uint64_t previous = 0;
707 for (i = 0; i < num_entries; ++i) {
708 entries[i] += previous;
709 previous = entries[i];
710 }
711 const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
712 return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
713}
714
715template<typename A>
716compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
717 auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
718 if (data.entry_bits == 64) { // versions 1 to 3
719 const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
720 return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
721 std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
722 } else { // version 4
723 std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
724 const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
725 // unpack blocks of 8 deltas
726 unsigned i;
727 for (i = 0; i + 7 < data.num_entries; i += 8) {
728 unpack_bits_block8(&entries[i], ptr, data.entry_bits);
729 ptr += data.entry_bits;
730 }
731 // unpack extra deltas if fewer than 8 of them left
732 uint8_t offset = 0;
733 for (; i < data.num_entries; ++i) {
734 offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
735 }
736 // undo deltas
737 uint64_t previous = 0;
738 for (i = 0; i < data.num_entries; ++i) {
739 entries[i] += previous;
740 previous = entries[i];
741 }
742 return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
743 }
744}
745
746// wrapped compact sketch
747
748template<typename A>
749wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
750data_(data)
751{}
752
753template<typename A>
754const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
755 return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
756}
757
758template<typename A>
762
763template<typename A>
765 return data_.is_empty;
766}
767
768template<typename A>
770 return data_.is_ordered;
771}
772
773template<typename A>
775 return data_.theta;
776}
777
778template<typename A>
780 return data_.num_entries;
781}
782
783template<typename A>
785 return data_.seed_hash;
786}
787
788template<typename A>
789auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
790 return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
791}
792
793template<typename A>
794auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
795 return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
796}
797
798template<typename A>
799void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
800
801template<typename A>
802void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
803 os << "### Retained entries" << std::endl;
804 for (const auto hash: *this) {
805 os << hash << std::endl;
806 }
807 os << "### End retained entries" << std::endl;
808}
809
810// assumes index == 0 or index == num_entries
811template<typename Allocator>
812wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
813 const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
814ptr_(ptr),
815entry_bits_(entry_bits),
816num_entries_(num_entries),
817index_(index),
818previous_(0),
819is_block_mode_(num_entries_ >= 8),
820buf_i_(0),
821offset_(0)
822{
823 if (entry_bits == 64) { // no compression
824 ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
825 } else if (index < num_entries) {
826 if (is_block_mode_) {
827 unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
828 ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
829 for (int i = 0; i < 8; ++i) {
830 buffer_[i] += previous_;
831 previous_ = buffer_[i];
832 }
833 } else {
834 offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
835 buffer_[0] += previous_;
836 previous_ = buffer_[0];
837 }
838 }
839}
840
841template<typename Allocator>
842auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
843 if (entry_bits_ == 64) { // no compression
844 ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
845 return *this;
846 }
847 ++index_;
848 if (index_ < num_entries_) {
849 if (is_block_mode_) {
850 ++buf_i_;
851 if (buf_i_ == 8) {
852 buf_i_ = 0;
853 if (index_ + 8 < num_entries_) {
854 unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
855 ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
856 for (int i = 0; i < 8; ++i) {
857 buffer_[i] += previous_;
858 previous_ = buffer_[i];
859 }
860 } else {
861 is_block_mode_ = false;
862 offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
863 buffer_[0] += previous_;
864 previous_ = buffer_[0];
865 }
866 }
867 } else {
868 offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
869 buffer_[0] += previous_;
870 previous_ = buffer_[0];
871 }
872 }
873 return *this;
874}
875
876template<typename Allocator>
877auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
878 const_iterator tmp(*this);
879 operator++();
880 return tmp;
881}
882
883template<typename Allocator>
884bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
885 if (entry_bits_ == 64) return ptr_ != other.ptr_;
886 return index_ != other.index_;
887}
888
889template<typename Allocator>
890bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
891 if (entry_bits_ == 64) return ptr_ == other.ptr_;
892 return index_ == other.index_;
893}
894
895template<typename Allocator>
896auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
897 if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
898 return buffer_[buf_i_];
899}
900
901template<typename Allocator>
902auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
903 if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
904 return buffer_ + buf_i_;
905}
906
907} /* namespace datasketches */
908
909#endif
double get_estimate() const
Definition theta_sketch_impl.hpp:47
double get_lower_bound(uint8_t num_std_devs) const
Returns the approximate lower error bound given a number of standard deviations.
Definition theta_sketch_impl.hpp:52
virtual string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition theta_sketch_impl.hpp:64
double get_upper_bound(uint8_t num_std_devs) const
Returns the approximate upper error bound given a number of standard deviations.
Definition theta_sketch_impl.hpp:58
double get_theta() const
Definition theta_sketch_impl.hpp:41
bool is_estimation_mode() const
Definition theta_sketch_impl.hpp:36
Compact Theta sketch.
Definition theta_sketch.hpp:359
compact_theta_sketch_alloc(const Other &other, bool ordered)
Copy constructor.
Definition theta_sketch_impl.hpp:267
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition theta_sketch_impl.hpp:378
virtual uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:307
virtual uint32_t get_num_retained() const
Definition theta_sketch_impl.hpp:312
virtual bool is_empty() const
Definition theta_sketch_impl.hpp:297
virtual bool is_ordered() const
Definition theta_sketch_impl.hpp:302
virtual uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:317
virtual iterator end()
Iterator pointing past the valid range.
Definition theta_sketch_impl.hpp:327
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
Computes maximum serialized size in bytes.
Definition theta_sketch_impl.hpp:353
static compact_theta_sketch_alloc deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual Allocator get_allocator() const
Definition theta_sketch_impl.hpp:292
size_t get_serialized_size_bytes(bool compressed=false) const
Computes size in bytes required to serialize the current state of the sketch.
Definition theta_sketch_impl.hpp:358
void serialize_compressed(std::ostream &os) const
This method serializes the sketch into a given stream in a compressed binary form.
Definition theta_sketch_impl.hpp:435
virtual iterator begin()
Iterator over hash values in this sketch.
Definition theta_sketch_impl.hpp:322
Theta base builder.
Definition theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition theta_sketch.hpp:127
Update Theta sketch builder.
Definition theta_sketch.hpp:526
update_theta_sketch_alloc build() const
Definition theta_sketch_impl.hpp:259
Update Theta sketch.
Definition theta_sketch.hpp:175
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition theta_sketch_impl.hpp:212
virtual uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:121
virtual bool is_empty() const
Definition theta_sketch_impl.hpp:111
virtual bool is_ordered() const
Definition theta_sketch_impl.hpp:116
virtual uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:131
virtual Allocator get_allocator() const
Definition theta_sketch_impl.hpp:106
void reset()
Reset the sketch to the initial empty state.
Definition theta_sketch_impl.hpp:217
update_theta_sketch_alloc(const update_theta_sketch_alloc &other)=default
Copy constructor.
Wrapped Compact Theta sketch.
Definition theta_sketch.hpp:543
uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:774
uint32_t get_num_retained() const
Definition theta_sketch_impl.hpp:779
bool is_empty() const
Definition theta_sketch_impl.hpp:764
bool is_ordered() const
Definition theta_sketch_impl.hpp:769
uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:784
static const wrapped_compact_theta_sketch_alloc wrap(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, bool dump_on_error=false)
This method wraps a serialized compact sketch as an array of bytes.
Definition theta_sketch_impl.hpp:754
Allocator get_allocator() const
Definition theta_sketch_impl.hpp:759
const_iterator begin() const
Const iterator over hash values in this sketch.
Definition theta_sketch_impl.hpp:789
const_iterator end() const
Const iterator pointing past the valid range.
Definition theta_sketch_impl.hpp:794
const uint64_t MAX_THETA
max theta - signed max for compatibility with Java
Definition theta_constants.hpp:36
DataSketches namespace.
Definition binomial_bounds.hpp:38