datasketches-cpp
Loading...
Searching...
No Matches
count_min.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef COUNT_MIN_HPP_
21#define COUNT_MIN_HPP_
22
23#include <iterator>
24#include "common_defs.hpp"
25
26namespace datasketches {
27
35template <typename W,
36 typename Allocator = std::allocator<W>>
38 static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
39public:
40 using allocator_type = Allocator;
41 using const_iterator = typename std::vector<W, Allocator>::const_iterator;
42
53 count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
54
58 uint8_t get_num_hashes() const;
59
63 uint32_t get_num_buckets() const;
64
68 uint64_t get_seed() const;
69
75 double get_relative_error() const;
76
81 W get_total_weight() const;
82
93 static uint32_t suggest_num_buckets(double relative_error);
94
104 static uint8_t suggest_num_hashes(double confidence);
105
112 W get_estimate(uint64_t item) const;
113
120 W get_estimate(int64_t item) const;
121
128 W get_estimate(const std::string& item) const;
129
138 W get_estimate(const void* item, size_t size) const;
139
147 W get_upper_bound(const void* item, size_t size) const;
148
155 W get_upper_bound(int64_t item) const;
156
163 W get_upper_bound(uint64_t item) const;
164
171 W get_upper_bound(const std::string& item) const;
172
180 W get_lower_bound(const void* item, size_t size) const;
181
188 W get_lower_bound(int64_t item) const;
189
196 W get_lower_bound(uint64_t item) const;
197
204 W get_lower_bound(const std::string& item) const;
205
214 void update(const void* item, size_t size, W weight);
215
221 void update(uint64_t item, W weight = 1);
222
228 void update(int64_t item, W weight = 1);
229
235 void update(const std::string& item, W weight = 1);
236
241 void merge(const count_min_sketch& other_sketch);
242
249 bool is_empty() const;
250
255 string<Allocator> to_string() const;
256
262 const_iterator begin() const;
263
270 const_iterator end() const;
271
272 /*
273 * The serialized sketch binary form has the following structure
274 * Byte 0:
275 * 1 - if and only if the sketch is empty
276 * 0 - otherwise
277 *
278 * Byte 1 (serial version), byte 2 (family id), byte 3 (flags):
279 * 00000001 - default for now.
280 *
281 * Bytes 4 - 7:
282 * uint8_t zero corresponding to ``empty''
283 *
284 * Byte 8:
285 * uint_8 for number of hash functions
286 *
287 * Bytes 9, 13
288 * 4 bytes : uint32 for number of buckets.
289 *
290 * Bytes 14, 15:
291 * seed_hash
292 *
293 * Byte 16:
294 * uint8_t zero corresponding to ``empty''
295 *
296 * All remaining bytes from 17-24 follow the pattern of
297 * Bytes 17-24:
298 * Sketch array entry
299 *
300
301 0 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
302 ||is_empty|ser__ver|familyId| flags |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
303
304 1 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
305 ||---------- _num_buckets -----------|num_hash|__seed__ __hash__|xxxxxxxx|
306
307 2 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
308 ||---------------------------- total weight ----------------------------|
309
310 3 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
311 ||---------------------------- sketch entries ---------------------------|
312 ...
313
314 */
315
316
321 size_t get_serialized_size_bytes() const;
322
327 void serialize(std::ostream& os) const;
328
329 // This is a convenience alias for users
330 // The type returned by the following serialize method
331 using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
332
340 vector_bytes serialize(unsigned header_size_bytes = 0) const;
341
349 static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
350
359 static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
360
364 allocator_type get_allocator() const;
365
366private:
367 Allocator _allocator;
368 uint8_t _num_hashes;
369 uint32_t _num_buckets;
370 std::vector<W, Allocator> _sketch_array; // the array stored by the sketch
371 uint64_t _seed;
372 W _total_weight;
373 std::vector<uint64_t> hash_seeds;
374
375 enum flags {IS_EMPTY};
376 static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
377 static const uint8_t PREAMBLE_LONGS_FULL = 3; // Not empty -> need (at least) third byte for total weight.
378 static const uint8_t SERIAL_VERSION_1 = 1;
379 static const uint8_t FAMILY_ID = 18;
380 static const uint8_t NULL_8 = 0;
381 static const uint32_t NULL_32 = 0;
382
389 static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
390
391 /*
392 * Obtain the hash values when inserting an item into the sketch.
393 * @param item pointer to the data item to be inserted into the sketch.
394 * @param size of the data in bytes
395 * @return vector of uint64_t which each represent the index to which `value' must update in the sketch
396 */
397 std::vector<uint64_t> get_hashes(const void* item, size_t size) const;
398
399};
400
401} /* namespace datasketches */
402
403#include "count_min_impl.hpp"
404
405#endif
C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
Definition count_min.hpp:37
static count_min_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition count_min_impl.hpp:270
static uint32_t suggest_num_buckets(double relative_error)
Suggests the number of buckets required to achieve the given relative error.
Definition count_min_impl.hpp:86
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition count_min_impl.hpp:265
static uint8_t suggest_num_hashes(double confidence)
Suggests the number of hash functions required to achieve the given confidence.
Definition count_min_impl.hpp:98
double get_relative_error() const
Definition count_min_impl.hpp:76
uint32_t get_num_buckets() const
Definition count_min_impl.hpp:66
bool is_empty() const
Returns true if this sketch is empty.
Definition count_min_impl.hpp:443
W get_total_weight() const
Definition count_min_impl.hpp:81
uint8_t get_num_hashes() const
Definition count_min_impl.hpp:61
allocator_type get_allocator() const
W get_upper_bound(const void *item, size_t size) const
Query the sketch for the upper bound of a given item.
Definition count_min_impl.hpp:209
W get_lower_bound(const void *item, size_t size) const
Query the sketch for the lower bound of a given item.
Definition count_min_impl.hpp:226
W get_estimate(uint64_t item) const
Specific get_estimate function for uint64_t type see generic get_estimate function.
Definition count_min_impl.hpp:143
uint64_t get_seed() const
Definition count_min_impl.hpp:71
void merge(const count_min_sketch &other_sketch)
Merges another count_min_sketch into this count_min_sketch.
Definition count_min_impl.hpp:231
void update(const void *item, size_t size, W weight)
Update this sketch with given data of any type.
Definition count_min_impl.hpp:184
string< Allocator > to_string() const
Returns a string describing the sketch.
Definition count_min_impl.hpp:448
size_t get_serialized_size_bytes() const
Computes size needed to serialize the current state of the sketch.
Definition count_min_impl.hpp:341
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition count_min_impl.hpp:260
static count_min_sketch deserialize(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
DataSketches namespace.
Definition binomial_bounds.hpp:38