datasketches-cpp
theta_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_SKETCH_HPP_
21 #define THETA_SKETCH_HPP_
22 
23 #include "theta_update_sketch_base.hpp"
24 #include "compact_theta_sketch_parser.hpp"
25 
26 namespace datasketches {
27 
28 // forward declarations
29 template<typename A> class theta_sketch_alloc;
30 template<typename A> class update_theta_sketch_alloc;
31 template<typename A> class compact_theta_sketch_alloc;
32 template<typename A> class wrapped_compact_theta_sketch_alloc;
33 
42 
44 template<typename Allocator = std::allocator<uint64_t>>
46 public:
47 
48  virtual ~base_theta_sketch_alloc() = default;
49 
53  virtual Allocator get_allocator() const = 0;
54 
58  virtual bool is_empty() const = 0;
59 
63  double get_estimate() const;
64 
72  double get_lower_bound(uint8_t num_std_devs) const;
73 
81  double get_upper_bound(uint8_t num_std_devs) const;
82 
86  bool is_estimation_mode() const;
87 
91  double get_theta() const;
92 
96  virtual uint64_t get_theta64() const = 0;
97 
101  virtual uint32_t get_num_retained() const = 0;
102 
106  virtual uint16_t get_seed_hash() const = 0;
107 
111  virtual bool is_ordered() const = 0;
112 
118  virtual string<Allocator> to_string(bool print_items = false) const;
119 
120 protected:
121  virtual void print_specifics(std::ostringstream& os) const = 0;
122  virtual void print_items(std::ostringstream& os) const = 0;
123 };
124 
126 template<typename Allocator = std::allocator<uint64_t>>
127 class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
128 public:
129  using Entry = uint64_t;
130  using ExtractKey = trivial_extract_key;
131  using iterator = theta_iterator<Entry, ExtractKey>;
132  using const_iterator = theta_const_iterator<Entry, ExtractKey>;
133 
134  virtual ~theta_sketch_alloc() = default;
135 
140  virtual iterator begin() = 0;
141 
147  virtual iterator end() = 0;
148 
153  virtual const_iterator begin() const = 0;
154 
160  virtual const_iterator end() const = 0;
161 
162 protected:
163  virtual void print_items(std::ostringstream& os) const;
164 };
165 
166 // forward declaration
167 template<typename A> class compact_theta_sketch_alloc;
168 
174 template<typename Allocator = std::allocator<uint64_t>>
176 public:
178  using Entry = typename Base::Entry;
179  using ExtractKey = typename Base::ExtractKey;
180  using iterator = typename Base::iterator;
181  using const_iterator = typename Base::const_iterator;
182  using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
183  using resize_factor = typename theta_table::resize_factor;
184 
185  // No constructor here. Use builder instead.
186  class builder;
187 
193 
199 
200  virtual ~update_theta_sketch_alloc() = default;
201 
208 
215 
216  virtual Allocator get_allocator() const;
217  virtual bool is_empty() const;
218  virtual bool is_ordered() const;
219  virtual uint16_t get_seed_hash() const;
220  virtual uint64_t get_theta64() const;
221  virtual uint32_t get_num_retained() const;
222 
226  uint8_t get_lg_k() const;
227 
231  resize_factor get_rf() const;
232 
237  void update(const std::string& value);
238 
243  void update(uint64_t value);
244 
249  void update(int64_t value);
250 
256  void update(uint32_t value);
257 
263  void update(int32_t value);
264 
270  void update(uint16_t value);
271 
277  void update(int16_t value);
278 
284  void update(uint8_t value);
285 
291  void update(int8_t value);
292 
298  void update(double value);
299 
305  void update(float value);
306 
320  void update(const void* data, size_t length);
321 
325  void trim();
326 
330  void reset();
331 
337  compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
338 
339  virtual iterator begin();
340  virtual iterator end();
341  virtual const_iterator begin() const;
342  virtual const_iterator end() const;
343 
344 private:
345  theta_table table_;
346 
347  // for builder
348  update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
349  uint64_t theta, uint64_t seed, const Allocator& allocator);
350 
351  virtual void print_specifics(std::ostringstream& os) const;
352 };
353 
358 template<typename Allocator = std::allocator<uint64_t>>
360 public:
362  using iterator = typename Base::iterator;
363  using const_iterator = typename Base::const_iterator;
364  using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
365  using vector_bytes = std::vector<uint8_t, AllocBytes>;
366 
367  static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
368  static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
369  static const uint8_t SKETCH_TYPE = 3;
370 
371  // Instances of this type can be obtained:
372  // - by compacting an update_theta_sketch_alloc
373  // - as a result of a set operation
374  // - by deserializing a previously serialized compact sketch
375 
382  template<typename Other>
383  compact_theta_sketch_alloc(const Other& other, bool ordered);
384 
390 
396 
397  virtual ~compact_theta_sketch_alloc() = default;
398 
405 
412 
413  virtual Allocator get_allocator() const;
414  virtual bool is_empty() const;
415  virtual bool is_ordered() const;
416  virtual uint64_t get_theta64() const;
417  virtual uint32_t get_num_retained() const;
418  virtual uint16_t get_seed_hash() const;
419 
424  static size_t get_max_serialized_size_bytes(uint8_t lg_k);
425 
432  size_t get_serialized_size_bytes(bool compressed = false) const;
433 
438  void serialize(std::ostream& os) const;
439 
447  vector_bytes serialize(unsigned header_size_bytes = 0) const;
448 
455  void serialize_compressed(std::ostream& os) const;
456 
466  vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
467 
468  virtual iterator begin();
469  virtual iterator end();
470  virtual const_iterator begin() const;
471  virtual const_iterator end() const;
472 
480  static compact_theta_sketch_alloc deserialize(std::istream& is,
481  uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
482 
491  static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
492  uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
493 
494 private:
495  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
496 
497  bool is_empty_;
498  bool is_ordered_;
499  uint16_t seed_hash_;
500  uint64_t theta_;
501  std::vector<uint64_t, Allocator> entries_;
502 
503  uint8_t get_preamble_longs(bool compressed) const;
504  bool is_suitable_for_compression() const;
505  uint8_t compute_entry_bits() const;
506  uint8_t get_num_entries_bytes() const;
507  size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
508  void serialize_version_4(std::ostream& os) const;
509  vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
510 
511  static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
512  static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
513  static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
514  static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
515 
516  virtual void print_specifics(std::ostringstream& os) const;
517 
518  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
519  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
520  template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
521  compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
522 };
523 
525 template<typename Allocator>
526 class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
527 public:
532  builder(const Allocator& allocator = Allocator());
534  update_theta_sketch_alloc build() const;
535 };
536 
542 template<typename Allocator = std::allocator<uint64_t>>
544 public:
545  class const_iterator;
546 
547  Allocator get_allocator() const;
548  bool is_empty() const;
549  bool is_ordered() const;
550  uint64_t get_theta64() const;
551  uint32_t get_num_retained() const;
552  uint16_t get_seed_hash() const;
553 
558  const_iterator begin() const;
559 
565  const_iterator end() const;
566 
575  static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
576 
577 protected:
578  virtual void print_specifics(std::ostringstream& os) const;
579  virtual void print_items(std::ostringstream& os) const;
580 
581 private:
582  using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
583  data_type data_;
584 
585  wrapped_compact_theta_sketch_alloc(const data_type& data);
586 };
587 
588 template<typename Allocator>
589 class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
590 public:
591  using iterator_category = std::input_iterator_tag;
592  using value_type = const uint64_t;
593  using difference_type = void;
594  using pointer = value_type*;
595  using reference = uint64_t;
596 
597  const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
598  const_iterator& operator++();
599  const_iterator operator++(int);
600  bool operator==(const const_iterator& other) const;
601  bool operator!=(const const_iterator& other) const;
602  reference operator*() const;
603  pointer operator->() const;
604 
605 private:
606  const void* ptr_;
607  uint8_t entry_bits_;
608  uint32_t num_entries_;
609  uint32_t index_;
610  uint64_t previous_;
611  bool is_block_mode_;
612  uint8_t buf_i_;
613  uint8_t offset_;
614  uint64_t buffer_[8];
615 };
616 
617 } /* namespace datasketches */
618 
619 #include "theta_sketch_impl.hpp"
620 
621 #endif
Abstract base class for Theta sketch.
Definition: theta_sketch.hpp:45
double get_estimate() const
Definition: theta_sketch_impl.hpp:47
double get_lower_bound(uint8_t num_std_devs) const
Returns the approximate lower error bound given a number of standard deviations.
Definition: theta_sketch_impl.hpp:52
virtual bool is_ordered() const =0
virtual bool is_empty() const =0
virtual string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition: theta_sketch_impl.hpp:64
virtual uint32_t get_num_retained() const =0
double get_upper_bound(uint8_t num_std_devs) const
Returns the approximate upper error bound given a number of standard deviations.
Definition: theta_sketch_impl.hpp:58
double get_theta() const
Definition: theta_sketch_impl.hpp:41
virtual uint16_t get_seed_hash() const =0
bool is_estimation_mode() const
Definition: theta_sketch_impl.hpp:36
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Compact Theta sketch.
Definition: theta_sketch.hpp:359
compact_theta_sketch_alloc(const Other &other, bool ordered)
Copy constructor.
Definition: theta_sketch_impl.hpp:267
compact_theta_sketch_alloc(const compact_theta_sketch_alloc &other)=default
Copy constructor.
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition: theta_sketch_impl.hpp:378
virtual uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:307
static compact_theta_sketch_alloc deserialize(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
virtual uint32_t get_num_retained() const
Definition: theta_sketch_impl.hpp:312
virtual bool is_empty() const
Definition: theta_sketch_impl.hpp:297
virtual bool is_ordered() const
Definition: theta_sketch_impl.hpp:302
virtual uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:317
virtual iterator end()
Iterator pointing past the valid range.
Definition: theta_sketch_impl.hpp:327
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
Computes maximum serialized size in bytes.
Definition: theta_sketch_impl.hpp:353
static compact_theta_sketch_alloc deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:292
size_t get_serialized_size_bytes(bool compressed=false) const
Computes size in bytes required to serialize the current state of the sketch.
Definition: theta_sketch_impl.hpp:358
compact_theta_sketch_alloc(compact_theta_sketch_alloc &&other) noexcept=default
Move constructor.
compact_theta_sketch_alloc & operator=(compact_theta_sketch_alloc &&other)=default
Move assignment.
void serialize_compressed(std::ostream &os) const
This method serializes the sketch into a given stream in a compressed binary form.
Definition: theta_sketch_impl.hpp:435
virtual iterator begin()
Iterator over hash values in this sketch.
Definition: theta_sketch_impl.hpp:322
compact_theta_sketch_alloc & operator=(const compact_theta_sketch_alloc &other)=default
Copy assignment.
Theta base builder.
Definition: theta_update_sketch_base.hpp:97
virtual const_iterator begin() const =0
Const iterator over hash values in this sketch.
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual iterator end()=0
Iterator pointing past the valid range.
virtual iterator begin()=0
Iterator over hash values in this sketch.
Update Theta sketch builder.
Definition: theta_sketch.hpp:526
Update Theta sketch.
Definition: theta_sketch.hpp:175
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition: theta_sketch_impl.hpp:212
virtual uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:121
update_theta_sketch_alloc & operator=(const update_theta_sketch_alloc &other)=default
Copy assignment.
virtual uint32_t get_num_retained() const
Definition: theta_sketch_impl.hpp:126
resize_factor get_rf() const
Definition: theta_sketch_impl.hpp:141
void update(const std::string &value)
Update this sketch with a given string.
Definition: theta_sketch_impl.hpp:196
virtual bool is_empty() const
Definition: theta_sketch_impl.hpp:111
virtual bool is_ordered() const
Definition: theta_sketch_impl.hpp:116
virtual uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:131
update_theta_sketch_alloc(update_theta_sketch_alloc &&other) noexcept=default
Move constructor.
virtual iterator end()
Iterator pointing past the valid range.
Definition: theta_sketch_impl.hpp:227
uint8_t get_lg_k() const
Definition: theta_sketch_impl.hpp:136
virtual Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:106
void reset()
Reset the sketch to the initial empty state.
Definition: theta_sketch_impl.hpp:217
virtual iterator begin()
Iterator over hash values in this sketch.
Definition: theta_sketch_impl.hpp:222
compact_theta_sketch_alloc< Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition: theta_sketch_impl.hpp:242
update_theta_sketch_alloc & operator=(update_theta_sketch_alloc &&other)=default
Move assignment.
update_theta_sketch_alloc(const update_theta_sketch_alloc &other)=default
Copy constructor.
Wrapped Compact Theta sketch.
Definition: theta_sketch.hpp:543
uint64_t get_theta64() const
Definition: theta_sketch_impl.hpp:774
uint32_t get_num_retained() const
Definition: theta_sketch_impl.hpp:779
bool is_empty() const
Definition: theta_sketch_impl.hpp:764
bool is_ordered() const
Definition: theta_sketch_impl.hpp:769
uint16_t get_seed_hash() const
Definition: theta_sketch_impl.hpp:784
static const wrapped_compact_theta_sketch_alloc wrap(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, bool dump_on_error=false)
This method wraps a serialized compact sketch as an array of bytes.
Definition: theta_sketch_impl.hpp:754
Allocator get_allocator() const
Definition: theta_sketch_impl.hpp:759
const_iterator begin() const
Const iterator over hash values in this sketch.
Definition: theta_sketch_impl.hpp:789
const_iterator end() const
Const iterator pointing past the valid range.
Definition: theta_sketch_impl.hpp:794
DataSketches namespace.
Definition: binomial_bounds.hpp:38