datasketches-cpp
theta_sketch_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_SKETCH_IMPL_HPP_
21 #define THETA_SKETCH_IMPL_HPP_
22 
23 #include <sstream>
24 #include <vector>
25 #include <stdexcept>
26 
27 #include "binomial_bounds.hpp"
28 #include "theta_helpers.hpp"
29 #include "count_zeros.hpp"
30 #include "bit_packing.hpp"
31 #include "memory_operations.hpp"
32 
33 namespace datasketches {
34 
35 template<typename A>
37  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
38 }
39 
40 template<typename A>
42  return static_cast<double>(get_theta64()) /
43  static_cast<double>(theta_constants::MAX_THETA);
44 }
45 
46 template<typename A>
48  return get_num_retained() / get_theta();
49 }
50 
51 template<typename A>
52 double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
53  if (!is_estimation_mode()) return get_num_retained();
54  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
55 }
56 
57 template<typename A>
58 double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
59  if (!is_estimation_mode()) return get_num_retained();
60  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
61 }
62 
63 template<typename A>
64 string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
65  // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
66  // The stream does not support passing an allocator instance, and alternatives are complicated.
67  std::ostringstream os;
68  os << "### Theta sketch summary:" << std::endl;
69  os << " num retained entries : " << this->get_num_retained() << std::endl;
70  os << " seed hash : " << this->get_seed_hash() << std::endl;
71  os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
72  os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
73  os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
74  os << " theta (fraction) : " << this->get_theta() << std::endl;
75  os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
76  os << " estimate : " << this->get_estimate() << std::endl;
77  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
78  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
79  print_specifics(os);
80  os << "### End sketch summary" << std::endl;
81  if (print_details) {
82  print_items(os);
83  }
84  return string<A>(os.str().c_str(), this->get_allocator());
85 }
86 
87 template<typename A>
88 void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
89  os << "### Retained entries" << std::endl;
90  for (const auto& hash: *this) {
91  os << hash << std::endl;
92  }
93  os << "### End retained entries" << std::endl;
94 }
95 
96 
97 // update sketch
98 
99 template<typename A>
100 update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
101  float p, uint64_t theta, uint64_t seed, const A& allocator):
102 table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
103 {}
104 
105 template<typename A>
107  return table_.allocator_;
108 }
109 
110 template<typename A>
112  return table_.is_empty_;
113 }
114 
115 template<typename A>
117  return table_.num_entries_ > 1 ? false : true;
118 }
119 
120 template<typename A>
122  return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
123 }
124 
125 template<typename A>
127  return table_.num_entries_;
128 }
129 
130 template<typename A>
132  return compute_seed_hash(table_.seed_);
133 }
134 
135 template<typename A>
137  return table_.lg_nom_size_;
138 }
139 
140 template<typename A>
141 auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
142  return table_.rf_;
143 }
144 
145 template<typename A>
147  update(&value, sizeof(value));
148 }
149 
150 template<typename A>
152  update(&value, sizeof(value));
153 }
154 
155 template<typename A>
157  update(static_cast<int32_t>(value));
158 }
159 
160 template<typename A>
162  update(static_cast<int64_t>(value));
163 }
164 
165 template<typename A>
167  update(static_cast<int16_t>(value));
168 }
169 
170 template<typename A>
172  update(static_cast<int64_t>(value));
173 }
174 
175 template<typename A>
177  update(static_cast<int8_t>(value));
178 }
179 
180 template<typename A>
182  update(static_cast<int64_t>(value));
183 }
184 
185 template<typename A>
187  update(canonical_double(value));
188 }
189 
190 template<typename A>
192  update(static_cast<double>(value));
193 }
194 
195 template<typename A>
196 void update_theta_sketch_alloc<A>::update(const std::string& value) {
197  if (value.empty()) return;
198  update(value.c_str(), value.length());
199 }
200 
201 template<typename A>
202 void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
203  const uint64_t hash = table_.hash_and_screen(data, length);
204  if (hash == 0) return;
205  auto result = table_.find(hash);
206  if (!result.second) {
207  table_.insert(result.first, hash);
208  }
209 }
210 
211 template<typename A>
213  table_.trim();
214 }
215 
216 template<typename A>
218  table_.reset();
219 }
220 
221 template<typename A>
223  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
224 }
225 
226 template<typename A>
228  return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
229 }
230 
231 template<typename A>
232 auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
233  return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
234 }
235 
236 template<typename A>
237 auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
238  return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
239 }
240 
241 template<typename A>
243  return compact_theta_sketch_alloc<A>(*this, ordered);
244 }
245 
246 template<typename A>
247 void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
248  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
249  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
250  os << " resize factor : " << (1 << table_.rf_) << std::endl;
251 }
252 
253 // builder
254 
255 template<typename A>
257 
258 template<typename A>
260  return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
261 }
262 
263 // compact sketch
264 
265 template<typename A>
266 template<typename Other>
268 is_empty_(other.is_empty()),
269 is_ordered_(other.is_ordered() || ordered),
270 seed_hash_(other.get_seed_hash()),
271 theta_(other.get_theta64()),
272 entries_(other.get_allocator())
273 {
274  if (!other.is_empty()) {
275  entries_.reserve(other.get_num_retained());
276  std::copy(other.begin(), other.end(), std::back_inserter(entries_));
277  if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
278  }
279 }
280 
281 template<typename A>
282 compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
283  std::vector<uint64_t, A>&& entries):
284 is_empty_(is_empty),
285 is_ordered_(is_ordered || (entries.size() <= 1ULL)),
286 seed_hash_(seed_hash),
287 theta_(theta),
288 entries_(std::move(entries))
289 {}
290 
291 template<typename A>
293  return entries_.get_allocator();
294 }
295 
296 template<typename A>
298  return is_empty_;
299 }
300 
301 template<typename A>
303  return is_ordered_;
304 }
305 
306 template<typename A>
308  return theta_;
309 }
310 
311 template<typename A>
313  return static_cast<uint32_t>(entries_.size());
314 }
315 
316 template<typename A>
318  return seed_hash_;
319 }
320 
321 template<typename A>
323  return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
324 }
325 
326 template<typename A>
328  return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
329 }
330 
331 template<typename A>
332 auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
333  return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
334 }
335 
336 template<typename A>
337 auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
338  return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
339 }
340 
341 template<typename A>
342 void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
343 
344 template<typename A>
345 uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
346  if (compressed) {
347  return this->is_estimation_mode() ? 2 : 1;
348  }
349  return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
350 }
351 
352 template<typename A>
354  return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
355 }
356 
357 template<typename A>
359  if (compressed && is_suitable_for_compression()) {
360  return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
361  }
362  return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
363 }
364 
365 // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
366 template<typename A>
368  return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
369 }
370 
371 template<typename A>
372 size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
373  const size_t compressed_bits = entry_bits * entries_.size();
374  return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
375 }
376 
377 template<typename A>
378 void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
379  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
380  write(os, preamble_longs);
381  write(os, UNCOMPRESSED_SERIAL_VERSION);
382  write(os, SKETCH_TYPE);
383  write<uint16_t>(os, 0); // unused
384  const uint8_t flags_byte(
385  (1 << flags::IS_COMPACT) |
386  (1 << flags::IS_READ_ONLY) |
387  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
388  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
389  );
390  write(os, flags_byte);
391  write(os, get_seed_hash());
392  if (preamble_longs > 1) {
393  write(os, static_cast<uint32_t>(entries_.size()));
394  write<uint32_t>(os, 0); // unused
395  }
396  if (this->is_estimation_mode()) write(os, this->theta_);
397  if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
398 }
399 
400 template<typename A>
401 auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
402  const size_t size = get_serialized_size_bytes() + header_size_bytes;
403  vector_bytes bytes(size, 0, entries_.get_allocator());
404  uint8_t* ptr = bytes.data() + header_size_bytes;
405  const uint8_t preamble_longs = get_preamble_longs(false);
406  *ptr++ = preamble_longs;
407  *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
408  *ptr++ = SKETCH_TYPE;
409  ptr += sizeof(uint16_t); // unused
410  const uint8_t flags_byte(
411  (1 << flags::IS_COMPACT) |
412  (1 << flags::IS_READ_ONLY) |
413  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
414  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
415  );
416  *ptr++ = flags_byte;
417  ptr += copy_to_mem(get_seed_hash(), ptr);
418  if (preamble_longs > 1) {
419  ptr += copy_to_mem(static_cast<uint32_t>(entries_.size()), ptr);
420  ptr += sizeof(uint32_t); // unused
421  }
422  if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
423  if (entries_.size() > 0) ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
424  return bytes;
425 }
426 
427 template<typename A>
429  if (!this->is_ordered() || entries_.size() == 0 ||
430  (entries_.size() == 1 && !this->is_estimation_mode())) return false;
431  return true;
432 }
433 
434 template<typename A>
436  if (is_suitable_for_compression()) return serialize_version_4(os);
437  return serialize(os);
438 }
439 
440 template<typename A>
441 auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
442  if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
443  return serialize(header_size_bytes);
444 }
445 
446 template<typename A>
448  // compression is based on leading zeros in deltas between ordered hash values
449  // assumes ordered sketch
450  uint64_t previous = 0;
451  uint64_t ored = 0;
452  for (const uint64_t entry: entries_) {
453  const uint64_t delta = entry - previous;
454  ored |= delta;
455  previous = entry;
456  }
457  return 64 - count_leading_zeros_in_u64(ored);
458 }
459 
460 template<typename A>
461 void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
462  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
463  const uint8_t entry_bits = compute_entry_bits();
464  const uint8_t num_entries_bytes = get_num_entries_bytes();
465 
466  write(os, preamble_longs);
467  write(os, COMPRESSED_SERIAL_VERSION);
468  write(os, SKETCH_TYPE);
469  write(os, entry_bits);
470  write(os, num_entries_bytes);
471  const uint8_t flags_byte(
472  (1 << flags::IS_COMPACT) |
473  (1 << flags::IS_READ_ONLY) |
474  (1 << flags::IS_ORDERED)
475  );
476  write(os, flags_byte);
477  write(os, get_seed_hash());
478  if (this->is_estimation_mode()) write(os, this->theta_);
479  uint32_t num_entries = static_cast<uint32_t>(entries_.size());
480  for (unsigned i = 0; i < num_entries_bytes; ++i) {
481  write<uint8_t>(os, num_entries & 0xff);
482  num_entries >>= 8;
483  }
484 
485  uint64_t previous = 0;
486  uint64_t deltas[8];
487  vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
488 
489  // pack blocks of 8 deltas
490  unsigned i;
491  for (i = 0; i + 7 < entries_.size(); i += 8) {
492  for (unsigned j = 0; j < 8; ++j) {
493  deltas[j] = entries_[i + j] - previous;
494  previous = entries_[i + j];
495  }
496  pack_bits_block8(deltas, buffer.data(), entry_bits);
497  write(os, buffer.data(), buffer.size());
498  }
499 
500  // pack extra deltas if fewer than 8 of them left
501  if (i < entries_.size()) {
502  uint8_t offset = 0;
503  uint8_t* ptr = buffer.data();
504  for (; i < entries_.size(); ++i) {
505  const uint64_t delta = entries_[i] - previous;
506  previous = entries_[i];
507  offset = pack_bits(delta, entry_bits, ptr, offset);
508  }
509  if (offset > 0) ++ptr;
510  write(os, buffer.data(), ptr - buffer.data());
511  }
512 }
513 
514 template<typename A>
515 auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
516  const uint8_t entry_bits = compute_entry_bits();
517  const uint8_t num_entries_bytes = get_num_entries_bytes();
518  const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
519  vector_bytes bytes(size, 0, entries_.get_allocator());
520  uint8_t* ptr = bytes.data() + header_size_bytes;
521 
522  *ptr++ = get_preamble_longs(true);
523  *ptr++ = COMPRESSED_SERIAL_VERSION;
524  *ptr++ = SKETCH_TYPE;
525  *ptr++ = entry_bits;
526  *ptr++ = num_entries_bytes;
527  const uint8_t flags_byte(
528  (1 << flags::IS_COMPACT) |
529  (1 << flags::IS_READ_ONLY) |
530  (1 << flags::IS_ORDERED)
531  );
532  *ptr++ = flags_byte;
533  ptr += copy_to_mem(get_seed_hash(), ptr);
534  if (this->is_estimation_mode()) {
535  ptr += copy_to_mem(theta_, ptr);
536  }
537  uint32_t num_entries = static_cast<uint32_t>(entries_.size());
538  for (unsigned i = 0; i < num_entries_bytes; ++i) {
539  *ptr++ = num_entries & 0xff;
540  num_entries >>= 8;
541  }
542 
543  uint64_t previous = 0;
544  uint64_t deltas[8];
545 
546  // pack blocks of 8 deltas
547  unsigned i;
548  for (i = 0; i + 7 < entries_.size(); i += 8) {
549  for (unsigned j = 0; j < 8; ++j) {
550  deltas[j] = entries_[i + j] - previous;
551  previous = entries_[i + j];
552  }
553  pack_bits_block8(deltas, ptr, entry_bits);
554  ptr += entry_bits;
555  }
556 
557  // pack extra deltas if fewer than 8 of them left
558  uint8_t offset = 0;
559  for (; i < entries_.size(); ++i) {
560  const uint64_t delta = entries_[i] - previous;
561  previous = entries_[i];
562  offset = pack_bits(delta, entry_bits, ptr, offset);
563  }
564  return bytes;
565 }
566 
567 template<typename A>
568 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
569  const auto preamble_longs = read<uint8_t>(is);
570  const auto serial_version = read<uint8_t>(is);
571  const auto type = read<uint8_t>(is);
572  checker<true>::check_sketch_type(type, SKETCH_TYPE);
573  switch (serial_version) {
574  case 4:
575  return deserialize_v4(preamble_longs, is, seed, allocator);
576  case 3:
577  return deserialize_v3(preamble_longs, is, seed, allocator);
578  case 1:
579  return deserialize_v1(preamble_longs, is, seed, allocator);
580  case 2:
581  return deserialize_v2(preamble_longs, is, seed, allocator);
582  default:
583  throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
584  }
585 }
586 
587 template<typename A>
588 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
589  uint8_t, std::istream& is, uint64_t seed, const A& allocator)
590 {
591  const auto seed_hash = compute_seed_hash(seed);
592  read<uint8_t>(is); // unused
593  read<uint32_t>(is); // unused
594  const auto num_entries = read<uint32_t>(is);
595  read<uint32_t>(is); //unused
596  const auto theta = read<uint64_t>(is);
597  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
598  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
599  if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
600  if (!is.good()) throw std::runtime_error("error reading from std::istream");
601  return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
602 }
603 
604 template<typename A>
605 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
606  uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
607 {
608  read<uint8_t>(is); // unused
609  read<uint16_t>(is); // unused
610  const uint16_t seed_hash = read<uint16_t>(is);
611  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
612  if (preamble_longs == 1) {
613  if (!is.good()) throw std::runtime_error("error reading from std::istream");
614  std::vector<uint64_t, A> entries(0, 0, allocator);
615  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
616  } else if (preamble_longs == 2) {
617  const uint32_t num_entries = read<uint32_t>(is);
618  read<uint32_t>(is); // unused
619  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
620  if (num_entries == 0) {
621  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
622  }
623  read(is, entries.data(), entries.size() * sizeof(uint64_t));
624  if (!is.good()) throw std::runtime_error("error reading from std::istream");
625  return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
626  } else if (preamble_longs == 3) {
627  const uint32_t num_entries = read<uint32_t>(is);
628  read<uint32_t>(is); // unused
629  const auto theta = read<uint64_t>(is);
630  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
631  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
632  if (is_empty) {
633  if (!is.good()) throw std::runtime_error("error reading from std::istream");
634  return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
635  } else {
636  read(is, entries.data(), sizeof(uint64_t) * entries.size());
637  if (!is.good()) throw std::runtime_error("error reading from std::istream");
638  return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
639  }
640  } else {
641  throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
642  }
643 }
644 
645 template<typename A>
646 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
647  uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
648 {
649  read<uint16_t>(is); // unused
650  const auto flags_byte = read<uint8_t>(is);
651  const auto seed_hash = read<uint16_t>(is);
652  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
653  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
654  uint64_t theta = theta_constants::MAX_THETA;
655  uint32_t num_entries = 0;
656  if (!is_empty) {
657  if (preamble_longs == 1) {
658  num_entries = 1;
659  } else {
660  num_entries = read<uint32_t>(is);
661  read<uint32_t>(is); // unused
662  if (preamble_longs > 2) theta = read<uint64_t>(is);
663  }
664  }
665  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
666  if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
667  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
668  if (!is.good()) throw std::runtime_error("error reading from std::istream");
669  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
670 }
671 
672 template<typename A>
673 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
674  uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
675 {
676  const auto entry_bits = read<uint8_t>(is);
677  const auto num_entries_bytes = read<uint8_t>(is);
678  const auto flags_byte = read<uint8_t>(is);
679  const auto seed_hash = read<uint16_t>(is);
680  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
681  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
682  uint64_t theta = theta_constants::MAX_THETA;
683  if (preamble_longs > 1) theta = read<uint64_t>(is);
684  uint32_t num_entries = 0;
685  for (unsigned i = 0; i < num_entries_bytes; ++i) {
686  num_entries |= read<uint8_t>(is) << (i << 3);
687  }
688  vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
689  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
690 
691  // unpack blocks of 8 deltas
692  unsigned i;
693  for (i = 0; i + 7 < num_entries; i += 8) {
694  read(is, buffer.data(), buffer.size());
695  unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
696  }
697  // unpack extra deltas if fewer than 8 of them left
698  if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
699  if (!is.good()) throw std::runtime_error("error reading from std::istream");
700  const uint8_t* ptr = buffer.data();
701  uint8_t offset = 0;
702  for (; i < num_entries; ++i) {
703  offset = unpack_bits(entries[i], entry_bits, ptr, offset);
704  }
705  // undo deltas
706  uint64_t previous = 0;
707  for (i = 0; i < num_entries; ++i) {
708  entries[i] += previous;
709  previous = entries[i];
710  }
711  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
712  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
713 }
714 
715 template<typename A>
716 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
717  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
718  if (data.entry_bits == 64) { // versions 1 to 3
719  const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
720  return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
721  std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
722  } else { // version 4
723  std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
724  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
725  // unpack blocks of 8 deltas
726  unsigned i;
727  for (i = 0; i + 7 < data.num_entries; i += 8) {
728  unpack_bits_block8(&entries[i], ptr, data.entry_bits);
729  ptr += data.entry_bits;
730  }
731  // unpack extra deltas if fewer than 8 of them left
732  uint8_t offset = 0;
733  for (; i < data.num_entries; ++i) {
734  offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
735  }
736  // undo deltas
737  uint64_t previous = 0;
738  for (i = 0; i < data.num_entries; ++i) {
739  entries[i] += previous;
740  previous = entries[i];
741  }
742  return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
743  }
744 }
745 
746 // wrapped compact sketch
747 
748 template<typename A>
749 wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
750 data_(data)
751 {}
752 
753 template<typename A>
754 const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
755  return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
756 }
757 
758 template<typename A>
760  return A();
761 }
762 
763 template<typename A>
765  return data_.is_empty;
766 }
767 
768 template<typename A>
770  return data_.is_ordered;
771 }
772 
773 template<typename A>
775  return data_.theta;
776 }
777 
778 template<typename A>
780  return data_.num_entries;
781 }
782 
783 template<typename A>
785  return data_.seed_hash;
786 }
787 
788 template<typename A>
789 auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
790  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
791 }
792 
793 template<typename A>
794 auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
795  return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
796 }
797 
798 template<typename A>
799 void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
800 
801 template<typename A>
802 void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
803  os << "### Retained entries" << std::endl;
804  for (const auto hash: *this) {
805  os << hash << std::endl;
806  }
807  os << "### End retained entries" << std::endl;
808 }
809 
810 // assumes index == 0 or index == num_entries
811 template<typename Allocator>
812 wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
813  const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
814 ptr_(ptr),
815 entry_bits_(entry_bits),
816 num_entries_(num_entries),
817 index_(index),
818 previous_(0),
819 is_block_mode_(num_entries_ >= 8),
820 buf_i_(0),
821 offset_(0)
822 {
823  if (entry_bits == 64) { // no compression
824  ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
825  } else if (index < num_entries) {
826  if (is_block_mode_) {
827  unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
828  ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
829  for (int i = 0; i < 8; ++i) {
830  buffer_[i] += previous_;
831  previous_ = buffer_[i];
832  }
833  } else {
834  offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
835  buffer_[0] += previous_;
836  previous_ = buffer_[0];
837  }
838  }
839 }
840 
841 template<typename Allocator>
842 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
843  if (entry_bits_ == 64) { // no compression
844  ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
845  return *this;
846  }
847  ++index_;
848  if (index_ < num_entries_) {
849  if (is_block_mode_) {
850  ++buf_i_;
851  if (buf_i_ == 8) {
852  buf_i_ = 0;
853  if (index_ + 8 < num_entries_) {
854  unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
855  ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
856  for (int i = 0; i < 8; ++i) {
857  buffer_[i] += previous_;
858  previous_ = buffer_[i];
859  }
860  } else {
861  is_block_mode_ = false;
862  offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
863  buffer_[0] += previous_;
864  previous_ = buffer_[0];
865  }
866  }
867  } else {
868  offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
869  buffer_[0] += previous_;
870  previous_ = buffer_[0];
871  }
872  }
873  return *this;
874 }
875 
876 template<typename Allocator>
877 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
878  const_iterator tmp(*this);
879  operator++();
880  return tmp;
881 }
882 
883 template<typename Allocator>
884 bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
885  if (entry_bits_ == 64) return ptr_ != other.ptr_;
886  return index_ != other.index_;
887 }
888 
889 template<typename Allocator>
890 bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
891  if (entry_bits_ == 64) return ptr_ == other.ptr_;
892  return index_ == other.index_;
893 }
894 
895 template<typename Allocator>
896 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
897  if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
898  return buffer_[buf_i_];
899 }
900 
901 template<typename Allocator>
902 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
903  if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
904  return buffer_ + buf_i_;
905 }
906 
907 } /* namespace datasketches */
908 
909 #endif
double get_estimate() const
Definition: theta_sketch_impl.hpp:47
double get_lower_bound(uint8_t num_std_devs) const
Returns the approximate lower error bound given a number of standard deviations.
Definition: theta_sketch_impl.hpp:52
virtual string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition: theta_sketch_impl.hpp:64
double get_upper_bound(uint8_t num_std_devs) const
Returns the approximate upper error bound given a number of standard deviations.
Definition: theta_sketch_impl.hpp:58
double get_theta() const
Definition: theta_sketch_impl.hpp:41
bool is_estimation_mode() const
Definition: theta_sketch_impl.hpp:36
Compact Theta sketch.
Definition: theta_sketch.hpp:359
compact_theta_sketch_alloc(const Other &other, bool ordered)
Copy constructor.
Definition: theta_sketch_impl.hpp:267
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition: theta_sketch_impl.hpp:378
virtual uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:307
virtual uint32_t get_num_retained() const
Definition: theta_sketch_impl.hpp:312
virtual bool is_empty() const
Definition: theta_sketch_impl.hpp:297
virtual bool is_ordered() const
Definition: theta_sketch_impl.hpp:302
virtual uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:317
virtual iterator end()
Iterator pointing past the valid range.
Definition: theta_sketch_impl.hpp:327
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
Computes maximum serialized size in bytes.
Definition: theta_sketch_impl.hpp:353
static compact_theta_sketch_alloc deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:292
size_t get_serialized_size_bytes(bool compressed=false) const
Computes size in bytes required to serialize the current state of the sketch.
Definition: theta_sketch_impl.hpp:358
void serialize_compressed(std::ostream &os) const
This method serializes the sketch into a given stream in a compressed binary form.
Definition: theta_sketch_impl.hpp:435
virtual iterator begin()
Iterator over hash values in this sketch.
Definition: theta_sketch_impl.hpp:322
Theta base builder.
Definition: theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition: theta_sketch.hpp:127
Update Theta sketch builder.
Definition: theta_sketch.hpp:526
update_theta_sketch_alloc build() const
Definition: theta_sketch_impl.hpp:259
Update Theta sketch.
Definition: theta_sketch.hpp:175
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition: theta_sketch_impl.hpp:212
virtual uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:121
virtual bool is_empty() const
Definition: theta_sketch_impl.hpp:111
virtual bool is_ordered() const
Definition: theta_sketch_impl.hpp:116
virtual uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:131
virtual Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:106
void reset()
Reset the sketch to the initial empty state.
Definition: theta_sketch_impl.hpp:217
update_theta_sketch_alloc(const update_theta_sketch_alloc &other)=default
Copy constructor.
Wrapped Compact Theta sketch.
Definition: theta_sketch.hpp:543
uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:774
uint32_t get_num_retained() const
Definition: theta_sketch_impl.hpp:779
bool is_empty() const
Definition: theta_sketch_impl.hpp:764
bool is_ordered() const
Definition: theta_sketch_impl.hpp:769
uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:784
static const wrapped_compact_theta_sketch_alloc wrap(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, bool dump_on_error=false)
This method wraps a serialized compact sketch as an array of bytes.
Definition: theta_sketch_impl.hpp:754
Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:759
const_iterator begin() const
Const iterator over hash values in this sketch.
Definition: theta_sketch_impl.hpp:789
const_iterator end() const
Const iterator pointing past the valid range.
Definition: theta_sketch_impl.hpp:794
const uint64_t MAX_THETA
max theta - signed max for compatibility with Java
Definition: theta_constants.hpp:36
DataSketches namespace.
Definition: binomial_bounds.hpp:38