datasketches-cpp
count_min.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef COUNT_MIN_HPP_
21 #define COUNT_MIN_HPP_
22 
23 #include <iterator>
24 #include "common_defs.hpp"
25 
26 namespace datasketches {
27 
35 template <typename W,
36  typename Allocator = std::allocator<W>>
38  static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
39 public:
40  using allocator_type = Allocator;
41  using const_iterator = typename std::vector<W, Allocator>::const_iterator;
42 
53  count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
54 
58  uint8_t get_num_hashes() const;
59 
63  uint32_t get_num_buckets() const;
64 
68  uint64_t get_seed() const;
69 
75  double get_relative_error() const;
76 
81  W get_total_weight() const;
82 
93  static uint32_t suggest_num_buckets(double relative_error);
94 
104  static uint8_t suggest_num_hashes(double confidence);
105 
112  W get_estimate(uint64_t item) const;
113 
120  W get_estimate(int64_t item) const;
121 
128  W get_estimate(const std::string& item) const;
129 
138  W get_estimate(const void* item, size_t size) const;
139 
147  W get_upper_bound(const void* item, size_t size) const;
148 
155  W get_upper_bound(int64_t item) const;
156 
163  W get_upper_bound(uint64_t item) const;
164 
171  W get_upper_bound(const std::string& item) const;
172 
180  W get_lower_bound(const void* item, size_t size) const;
181 
188  W get_lower_bound(int64_t item) const;
189 
196  W get_lower_bound(uint64_t item) const;
197 
204  W get_lower_bound(const std::string& item) const;
205 
214  void update(const void* item, size_t size, W weight);
215 
221  void update(uint64_t item, W weight = 1);
222 
228  void update(int64_t item, W weight = 1);
229 
235  void update(const std::string& item, W weight = 1);
236 
241  void merge(const count_min_sketch& other_sketch);
242 
249  bool is_empty() const;
250 
255  string<Allocator> to_string() const;
256 
262  const_iterator begin() const;
263 
270  const_iterator end() const;
271 
272  /*
273  * The serialized sketch binary form has the following structure
274  * Byte 0:
275  * 1 - if and only if the sketch is empty
276  * 0 - otherwise
277  *
278  * Byte 1 (serial version), byte 2 (family id), byte 3 (flags):
279  * 00000001 - default for now.
280  *
281  * Bytes 4 - 7:
282  * uint8_t zero corresponding to ``empty''
283  *
284  * Byte 8:
285  * uint_8 for number of hash functions
286  *
287  * Bytes 9, 13
288  * 4 bytes : uint32 for number of buckets.
289  *
290  * Bytes 14, 15:
291  * seed_hash
292  *
293  * Byte 16:
294  * uint8_t zero corresponding to ``empty''
295  *
296  * All remaining bytes from 17-24 follow the pattern of
297  * Bytes 17-24:
298  * Sketch array entry
299  *
300 
301  0 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
302  ||is_empty|ser__ver|familyId| flags |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
303 
304  1 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
305  ||---------- _num_buckets -----------|num_hash|__seed__ __hash__|xxxxxxxx|
306 
307  2 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
308  ||---------------------------- total weight ----------------------------|
309 
310  3 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
311  ||---------------------------- sketch entries ---------------------------|
312  ...
313 
314  */
315 
316 
321  size_t get_serialized_size_bytes() const;
322 
327  void serialize(std::ostream& os) const;
328 
329  // This is a convenience alias for users
330  // The type returned by the following serialize method
331  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
332 
340  vector_bytes serialize(unsigned header_size_bytes = 0) const;
341 
349  static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
350 
359  static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
360 
364  allocator_type get_allocator() const;
365 
366 private:
367  Allocator _allocator;
368  uint8_t _num_hashes;
369  uint32_t _num_buckets;
370  std::vector<W, Allocator> _sketch_array; // the array stored by the sketch
371  uint64_t _seed;
372  W _total_weight;
373  std::vector<uint64_t> hash_seeds;
374 
375  enum flags {IS_EMPTY};
376  static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
377  static const uint8_t PREAMBLE_LONGS_FULL = 3; // Not empty -> need (at least) third byte for total weight.
378  static const uint8_t SERIAL_VERSION_1 = 1;
379  static const uint8_t FAMILY_ID = 18;
380  static const uint8_t NULL_8 = 0;
381  static const uint32_t NULL_32 = 0;
382 
389  static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
390 
391  /*
392  * Obtain the hash values when inserting an item into the sketch.
393  * @param item pointer to the data item to be inserted into the sketch.
394  * @param size of the data in bytes
395  * @return vector of uint64_t which each represent the index to which `value' must update in the sketch
396  */
397  std::vector<uint64_t> get_hashes(const void* item, size_t size) const;
398 
399 };
400 
401 } /* namespace datasketches */
402 
403 #include "count_min_impl.hpp"
404 
405 #endif
C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
Definition: count_min.hpp:37
static count_min_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition: count_min_impl.hpp:270
static uint32_t suggest_num_buckets(double relative_error)
Suggests the number of buckets required to achieve the given relative error.
Definition: count_min_impl.hpp:86
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: count_min_impl.hpp:265
static uint8_t suggest_num_hashes(double confidence)
Suggests the number of hash functions required to achieve the given confidence.
Definition: count_min_impl.hpp:98
double get_relative_error() const
Definition: count_min_impl.hpp:76
uint32_t get_num_buckets() const
Definition: count_min_impl.hpp:66
bool is_empty() const
Returns true if this sketch is empty.
Definition: count_min_impl.hpp:443
W get_total_weight() const
Definition: count_min_impl.hpp:81
uint8_t get_num_hashes() const
Definition: count_min_impl.hpp:61
allocator_type get_allocator() const
W get_upper_bound(const void *item, size_t size) const
Query the sketch for the upper bound of a given item.
Definition: count_min_impl.hpp:209
W get_lower_bound(const void *item, size_t size) const
Query the sketch for the lower bound of a given item.
Definition: count_min_impl.hpp:226
W get_estimate(uint64_t item) const
Specific get_estimate function for uint64_t type see generic get_estimate function.
Definition: count_min_impl.hpp:143
count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed,...
Definition: count_min_impl.hpp:35
uint64_t get_seed() const
Definition: count_min_impl.hpp:71
void merge(const count_min_sketch &other_sketch)
Merges another count_min_sketch into this count_min_sketch.
Definition: count_min_impl.hpp:231
void update(const void *item, size_t size, W weight)
Update this sketch with given data of any type.
Definition: count_min_impl.hpp:184
string< Allocator > to_string() const
Returns a string describing the sketch.
Definition: count_min_impl.hpp:448
size_t get_serialized_size_bytes() const
Computes size needed to serialize the current state of the sketch.
Definition: count_min_impl.hpp:341
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: count_min_impl.hpp:260
static count_min_sketch deserialize(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
DataSketches namespace.
Definition: binomial_bounds.hpp:38