// tail of update_theta_sketch_alloc<A>::print_items()
    os << hash << std::endl;
  }
  os << "### End retained entries" << std::endl;

// update_theta_sketch_alloc<A> constructor: trailing parameters and member initialization
    float p, uint64_t theta, uint64_t seed, const A& allocator):
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
{}

// update_theta_sketch_alloc<A> accessor bodies:
  return table_.allocator_;  // get_allocator()
  return table_.is_empty_;  // is_empty()
  return table_.num_entries_ > 1 ? false : true;  // is_ordered(): at most one entry is trivially ordered
  return is_empty() ? theta_constants::MAX_THETA : table_.theta_;  // get_theta64()
  return table_.num_entries_;  // get_num_retained()
  return compute_seed_hash(table_.seed_);  // get_seed_hash()
  return table_.lg_nom_size_;  // get_lg_k()

// update_theta_sketch_alloc<A>::update() overload bodies:
  update(&value, sizeof(value));  // update(uint64_t)
  update(&value, sizeof(value));  // update(int64_t)
  update(static_cast<int32_t>(value));  // update(uint32_t)
  update(static_cast<int64_t>(value));  // update(int32_t)
  update(static_cast<int16_t>(value));  // update(uint16_t)
  update(static_cast<int64_t>(value));  // update(int16_t)
  update(static_cast<int8_t>(value));  // update(uint8_t)
  update(static_cast<int64_t>(value));  // update(int8_t)
  update(canonical_double(value));  // update(double)
  update(static_cast<double>(value));  // update(float)
  if (value.empty()) return;  // update(const std::string&)
  update(value.c_str(), value.length());
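
// update(const void* data, size_t length): the core update. hash_and_screen() hashes the
// input with the sketch seed and returns 0 when the hash does not pass the theta screen;
// hashes that pass are inserted only if not already present in the table.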
  const uint64_t hash = table_.hash_and_screen(data, length);
  if (hash == 0) return;
  auto result = table_.find(hash);
  if (!result.second) {
    table_.insert(result.first, hash);
  }

// update_theta_sketch_alloc<A> begin()/end() bodies:
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);  // begin()
  return iterator(nullptr, 0, 1 << table_.lg_cur_size_);  // end()
  return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);  // begin() const
  return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);  // end() const

  os << "   lg nominal size      : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
  os << "   lg current size      : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
  os << "   resize factor        : " << (1 << table_.rf_) << std::endl;

  return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
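
// Example usage (a rough sketch, not part of this file; assumes the plain update_theta_sketch /
// compact_theta_sketch aliases and default builder settings from the public headers):
//
//   update_theta_sketch sketch = update_theta_sketch::builder().build();
//   for (int i = 0; i < 100000; ++i) sketch.update(i);
//   compact_theta_sketch compact = sketch.compact();      // ordered, immutable form
//   auto bytes = compact.serialize_compressed();          // compressed (v4) image when suitable
//   auto restored = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
//   double estimate = restored.get_estimate();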
// conversion constructor compact_theta_sketch_alloc<A>(const Other& other, bool ordered):
// copies retained hashes from the source sketch, sorting them if an ordered result was requested
template<typename Other>
  if (!other.is_empty()) {
    entries_.reserve(other.get_num_retained());
    std::copy(other.begin(), other.end(), std::back_inserter(entries_));
    if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
  }

// compact_theta_sketch_alloc<A> field constructor: trailing parameter and member initializers
    std::vector<uint64_t, A>&& entries):
is_empty_(is_empty),
is_ordered_(is_ordered || (entries.size() <= 1ULL)),
seed_hash_(seed_hash),
theta_(theta),
entries_(std::move(entries))
{}

// compact_theta_sketch_alloc<A> accessor and begin()/end() bodies:
  return entries_.get_allocator();  // get_allocator()
  return static_cast<uint32_t>(entries_.size());  // get_num_retained()
  return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);  // begin()
  return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));  // end()
  return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);  // begin() const
  return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));  // end() const

template<typename A>
uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
  if (compressed) {
    return this->is_estimation_mode() ? 2 : 1;
  }
  return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
}

  if (compressed && is_suitable_for_compression()) {
    return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
  }
  return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
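
// get_num_entries_bytes(): the minimum number of whole bytes needed to hold the current
// entry count; the compressed (v4) format stores the count in that many bytes instead of
// a fixed 4-byte field.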
  return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));

template<typename A>
size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
  const size_t compressed_bits = entry_bits * entries_.size();
  return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
}

// compact_theta_sketch_alloc<A>::serialize(std::ostream&): uncompressed (serial version 3) format
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
  write(os, preamble_longs);
  write(os, UNCOMPRESSED_SERIAL_VERSION);
  write(os, SKETCH_TYPE);
  write<uint16_t>(os, 0); // unused
  const uint8_t flags_byte(
    (1 << flags::IS_COMPACT) |
    (1 << flags::IS_READ_ONLY) |
    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
  );
  write(os, flags_byte);
  write(os, get_seed_hash());
  if (preamble_longs > 1) {
    write(os, static_cast<uint32_t>(entries_.size()));
    write<uint32_t>(os, 0); // unused
  }
  if (this->is_estimation_mode()) write(os, this->theta_);
  if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));

// compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes): the same uncompressed
// layout written into a byte vector, leaving header_size_bytes of space at the front
  const size_t size = get_serialized_size_bytes() + header_size_bytes;
  vector_bytes bytes(size, 0, entries_.get_allocator());
  uint8_t* ptr = bytes.data() + header_size_bytes;
  const uint8_t preamble_longs = get_preamble_longs(false);
  *ptr++ = preamble_longs;
  *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
  *ptr++ = SKETCH_TYPE;
  ptr += sizeof(uint16_t); // unused
  const uint8_t flags_byte(
    (1 << flags::IS_COMPACT) |
    (1 << flags::IS_READ_ONLY) |
    (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
    (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
  );
  *ptr++ = flags_byte;
  ptr += copy_to_mem(get_seed_hash(), ptr);
  if (preamble_longs > 1) {
    ptr += copy_to_mem(static_cast<uint32_t>(entries_.size()), ptr);
    ptr += sizeof(uint32_t); // unused
  }
  if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
  if (entries_.size() > 0) ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
  return bytes;

// is_suitable_for_compression(): only ordered sketches qualify; empty sketches and
// single-entry exact-mode sketches are excluded
  if (!this->is_ordered() || entries_.size() == 0 ||
      (entries_.size() == 1 && !this->is_estimation_mode())) return false;
  return true;

  // serialize_compressed(std::ostream&)
  if (is_suitable_for_compression()) return serialize_version_4(os);
  return serialize(os);

  // serialize_compressed(unsigned header_size_bytes)
  if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
  return serialize(header_size_bytes);
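
// compute_entry_bits(): entries are kept in ascending order, so consecutive deltas are small;
// OR-ing all deltas together gives the bit width of the largest delta, which becomes the
// per-entry width for the compressed (v4) encoding.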
  uint64_t previous = 0;
  uint64_t ored = 0;
  for (const uint64_t entry: entries_) {
    const uint64_t delta = entry - previous;
    ored |= delta;
    previous = entry;
  }
  return 64 - count_leading_zeros_in_u64(ored);
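
// serialize_version_4(std::ostream&): compressed (serial version 4) image. Writes a short
// preamble (version, type, entry bit width, entry-count byte count, flags, seed hash and,
// in estimation mode, theta), then the entry count in num_entries_bytes little-endian bytes,
// then the delta-encoded entries bit-packed entry_bits wide, in blocks of 8 with a scalar tail.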
template<typename A>
void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
  const uint8_t entry_bits = compute_entry_bits();
  const uint8_t num_entries_bytes = get_num_entries_bytes();

  write(os, preamble_longs);
  write(os, COMPRESSED_SERIAL_VERSION);
  write(os, SKETCH_TYPE);
  write(os, entry_bits);
  write(os, num_entries_bytes);
  const uint8_t flags_byte(
    (1 << flags::IS_COMPACT) |
    (1 << flags::IS_READ_ONLY) |
    (1 << flags::IS_ORDERED)
  );
  write(os, flags_byte);
  write(os, get_seed_hash());
  if (this->is_estimation_mode()) write(os, this->theta_);
  uint32_t num_entries = static_cast<uint32_t>(entries_.size());
  for (unsigned i = 0; i < num_entries_bytes; ++i) {
    write<uint8_t>(os, num_entries & 0xff);
    num_entries >>= 8;
  }

  uint64_t previous = 0;
  uint64_t deltas[8];
  vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // a block of 8 entries takes entry_bits bytes

  // pack blocks of 8 deltas
  unsigned i;
  for (i = 0; i + 7 < entries_.size(); i += 8) {
    for (unsigned j = 0; j < 8; ++j) {
      deltas[j] = entries_[i + j] - previous;
      previous = entries_[i + j];
    }
    pack_bits_block8(deltas, buffer.data(), entry_bits);
    write(os, buffer.data(), buffer.size());
  }

  // pack any remaining deltas one at a time
  if (i < entries_.size()) {
    uint8_t offset = 0;
    uint8_t* ptr = buffer.data();
    for (; i < entries_.size(); ++i) {
      const uint64_t delta = entries_[i] - previous;
      previous = entries_[i];
      offset = pack_bits(delta, entry_bits, ptr, offset);
    }
    if (offset > 0) ++ptr;
    write(os, buffer.data(), ptr - buffer.data());
  }
}
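
// serialize_version_4(unsigned header_size_bytes): the same compressed layout written into a
// pre-sized byte vector, packing the deltas directly into the output buffer.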
template<typename A>
auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
  const uint8_t entry_bits = compute_entry_bits();
  const uint8_t num_entries_bytes = get_num_entries_bytes();
  const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
  vector_bytes bytes(size, 0, entries_.get_allocator());
  uint8_t* ptr = bytes.data() + header_size_bytes;

  *ptr++ = get_preamble_longs(true);
  *ptr++ = COMPRESSED_SERIAL_VERSION;
  *ptr++ = SKETCH_TYPE;
  *ptr++ = entry_bits;
  *ptr++ = num_entries_bytes;
  const uint8_t flags_byte(
    (1 << flags::IS_COMPACT) |
    (1 << flags::IS_READ_ONLY) |
    (1 << flags::IS_ORDERED)
  );
  *ptr++ = flags_byte;
  ptr += copy_to_mem(get_seed_hash(), ptr);
  if (this->is_estimation_mode()) {
    ptr += copy_to_mem(theta_, ptr);
  }
  uint32_t num_entries = static_cast<uint32_t>(entries_.size());
  for (unsigned i = 0; i < num_entries_bytes; ++i) {
    *ptr++ = num_entries & 0xff;
    num_entries >>= 8;
  }

  uint64_t previous = 0;
  uint64_t deltas[8];

  // pack blocks of 8 deltas
  unsigned i;
  for (i = 0; i + 7 < entries_.size(); i += 8) {
    for (unsigned j = 0; j < 8; ++j) {
      deltas[j] = entries_[i + j] - previous;
      previous = entries_[i + j];
    }
    pack_bits_block8(deltas, ptr, entry_bits);
    ptr += entry_bits;
  }

  // pack any remaining deltas one at a time
  uint8_t offset = 0;
  for (; i < entries_.size(); ++i) {
    const uint64_t delta = entries_[i] - previous;
    previous = entries_[i];
    offset = pack_bits(delta, entry_bits, ptr, offset);
  }
  return bytes;
}
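
// deserialize(std::istream&, ...): dispatches on the serial version byte; versions 1-3 store
// raw 64-bit hashes, version 4 is the compressed delta-encoded format above.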
  const auto preamble_longs = read<uint8_t>(is);
  const auto serial_version = read<uint8_t>(is);
  const auto type = read<uint8_t>(is);
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
  switch (serial_version) {
    case 4:
      return deserialize_v4(preamble_longs, is, seed, allocator);
    case 3:
      return deserialize_v3(preamble_longs, is, seed, allocator);
    case 1:
      return deserialize_v1(preamble_longs, is, seed, allocator);
    case 2:
      return deserialize_v2(preamble_longs, is, seed, allocator);
    default:
      throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
  }

template<typename A>
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
    uint8_t, std::istream& is, uint64_t seed, const A& allocator)
{
  const auto seed_hash = compute_seed_hash(seed);
  const auto num_entries = read<uint32_t>(is);
  const auto theta = read<uint64_t>(is);
  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
  const bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
  if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
  return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
}

template<typename A>
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
    uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
{
  const uint16_t seed_hash = read<uint16_t>(is);
  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
  if (preamble_longs == 1) {
    if (!is.good()) throw std::runtime_error("error reading from std::istream");
    std::vector<uint64_t, A> entries(0, 0, allocator);
    return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
  } else if (preamble_longs == 2) {
    const uint32_t num_entries = read<uint32_t>(is);
    std::vector<uint64_t, A> entries(num_entries, 0, allocator);
    if (num_entries == 0) {
      return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
    }
    read(is, entries.data(), entries.size() * sizeof(uint64_t));
    if (!is.good()) throw std::runtime_error("error reading from std::istream");
    return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
  } else if (preamble_longs == 3) {
    const uint32_t num_entries = read<uint32_t>(is);
    const auto theta = read<uint64_t>(is);
    const bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
    std::vector<uint64_t, A> entries(num_entries, 0, allocator);
    if (is_empty) {
      if (!is.good()) throw std::runtime_error("error reading from std::istream");
      return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
    } else {
      read(is, entries.data(), sizeof(uint64_t) * entries.size());
      if (!is.good()) throw std::runtime_error("error reading from std::istream");
      return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
    }
  } else {
    throw std::invalid_argument(std::to_string(preamble_longs) + " longs of preamble, but expected 1, 2, or 3");
  }
}

template<typename A>
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
    uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
{
  const auto flags_byte = read<uint8_t>(is);
  const auto seed_hash = read<uint16_t>(is);
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
  uint64_t theta = theta_constants::MAX_THETA;
  uint32_t num_entries = 0;
  if (preamble_longs == 1) {
    if (!is_empty) num_entries = 1; // single-item sketch has no count field
  } else {
    num_entries = read<uint32_t>(is);
    if (preamble_longs > 2) theta = read<uint64_t>(is);
  }
  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
  if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
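
// deserialize_v4(...): reads the compressed image. The entry count is rebuilt from its
// little-endian bytes, deltas are unpacked in blocks of 8 (one entry at a time for the tail),
// and a running prefix sum undoes the delta encoding to restore the ordered hash values.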
template<typename A>
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
    uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
{
  const auto entry_bits = read<uint8_t>(is);
  const auto num_entries_bytes = read<uint8_t>(is);
  const auto flags_byte = read<uint8_t>(is);
  const auto seed_hash = read<uint16_t>(is);
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
  uint64_t theta = theta_constants::MAX_THETA;
  if (preamble_longs > 1) theta = read<uint64_t>(is);
  uint32_t num_entries = 0;
  for (unsigned i = 0; i < num_entries_bytes; ++i) {
    num_entries |= read<uint8_t>(is) << (i << 3);
  }
  vector_bytes buffer(entry_bits, 0, allocator); // a block of 8 entries takes entry_bits bytes
  std::vector<uint64_t, A> entries(num_entries, 0, allocator);

  // unpack blocks of 8 deltas
  unsigned i;
  for (i = 0; i + 7 < num_entries; i += 8) {
    read(is, buffer.data(), buffer.size());
    unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
  }
  // unpack any remaining deltas one at a time
  if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
  const uint8_t* ptr = buffer.data();
  uint8_t offset = 0;
  for (; i < num_entries; ++i) {
    offset = unpack_bits(entries[i], entry_bits, ptr, offset);
  }
  // undo deltas to restore the original ordered hash values
  uint64_t previous = 0;
  for (i = 0; i < num_entries; ++i) {
    entries[i] += previous;
    previous = entries[i];
  }
  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}

// compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, ...): parse a
// serialized image held in memory
  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
  if (data.entry_bits == 64) { // uncompressed (serial versions 1 to 3)
    const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
    return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
        std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
  } else { // compressed (serial version 4)
    std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);

    // unpack blocks of 8 deltas
    unsigned i;
    for (i = 0; i + 7 < data.num_entries; i += 8) {
      unpack_bits_block8(&entries[i], ptr, data.entry_bits);
      ptr += data.entry_bits;
    }
    // unpack any remaining deltas one at a time
    uint8_t offset = 0;
    for (; i < data.num_entries; ++i) {
      offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
    }
    // undo deltas to restore the original ordered hash values
    uint64_t previous = 0;
    for (i = 0; i < data.num_entries; ++i) {
      entries[i] += previous;
      previous = entries[i];
    }
    return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
  }
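
// wrapped_compact_theta_sketch_alloc: a read-only view over a serialized compact sketch image;
// it holds the parsed image descriptor rather than copying the entries, and its const_iterator
// decodes compressed entries on the fly.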
template<typename A>
wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
data_(data)
{}

// wrapped_compact_theta_sketch_alloc<A> accessor bodies:
  return data_.is_empty;  // is_empty()
  return data_.is_ordered;  // is_ordered()
  return data_.num_entries;  // get_num_retained()
  return data_.seed_hash;  // get_seed_hash()
  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);  // begin()
  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);  // end()

template<typename A>
void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
  os << "### Retained entries" << std::endl;
  for (const auto hash: *this) {
    os << hash << std::endl;
  }
  os << "### End retained entries" << std::endl;
}
template<typename Allocator>
wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
    const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
ptr_(ptr),
entry_bits_(entry_bits),
num_entries_(num_entries),
index_(index),
previous_(0),
is_block_mode_(num_entries_ >= 8),
buf_i_(0),
offset_(0)
{
  if (entry_bits == 64) { // uncompressed: point directly at the 64-bit entry
    ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
  } else if (index < num_entries) {
    if (is_block_mode_) {
      unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
      ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
      for (int i = 0; i < 8; ++i) {
        buffer_[i] += previous_;
        previous_ = buffer_[i];
      }
    } else {
      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
      buffer_[0] += previous_;
      previous_ = buffer_[0];
    }
  }
}

template<typename Allocator>
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
  if (entry_bits_ == 64) { // uncompressed
    ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
    return *this;
  }
  ++index_;
  if (index_ < num_entries_) {
    if (is_block_mode_) {
      ++buf_i_;
      if (buf_i_ == 8) { // the buffered block is exhausted
        buf_i_ = 0;
        if (index_ + 8 < num_entries_) {
          unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
          ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
          for (int i = 0; i < 8; ++i) {
            buffer_[i] += previous_;
            previous_ = buffer_[i];
          }
        } else { // fewer than 8 entries left: switch to one-at-a-time unpacking
          is_block_mode_ = false;
          offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
          buffer_[0] += previous_;
          previous_ = buffer_[0];
        }
      }
    } else {
      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
      buffer_[0] += previous_;
      previous_ = buffer_[0];
    }
  }
  return *this;
}

template<typename Allocator>
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
  const_iterator tmp(*this);
  operator++();
  return tmp;
}

template<typename Allocator>
bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
  if (entry_bits_ == 64) return ptr_ != other.ptr_;
  return index_ != other.index_;
}

template<typename Allocator>
bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
  if (entry_bits_ == 64) return ptr_ == other.ptr_;
  return index_ == other.index_;
}

template<typename Allocator>
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
  if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
  return buffer_[buf_i_];
}

template<typename Allocator>
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
  if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
  return buffer_ + buf_i_;
}