datasketches-cpp
Loading...
Searching...
No Matches
cpc_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef CPC_SKETCH_HPP_
21#define CPC_SKETCH_HPP_
22
23#include <iostream>
24#include <functional>
25#include <string>
26#include <vector>
27
28#include "u32_table.hpp"
29#include "cpc_common.hpp"
30#include "cpc_compressor.hpp"
31#include "cpc_confidence.hpp"
32#include "common_defs.hpp"
33
34namespace datasketches {
35
36// forward declarations
37template<typename A> class cpc_sketch_alloc;
38template<typename A> class cpc_union_alloc;
39
42
51template<typename A> void cpc_init();
52
63template<typename A>
65public:
66 using allocator_type = A;
67 using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
68 using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
69
76 explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
77
79 A get_allocator() const;
80
82 uint8_t get_lg_k() const;
83
85 bool is_empty() const;
86
88 double get_estimate() const;
89
97 double get_lower_bound(unsigned kappa) const;
98
106 double get_upper_bound(unsigned kappa) const;
107
112 void update(const std::string& value);
113
118 void update(uint64_t value);
119
124 void update(int64_t value);
125
131 void update(uint32_t value);
132
138 void update(int32_t value);
139
145 void update(uint16_t value);
146
152 void update(int16_t value);
153
159 void update(uint8_t value);
160
166 void update(int8_t value);
167
173 void update(double value);
174
180 void update(float value);
181
195 void update(const void* value, size_t size);
196
201 string<A> to_string() const;
202
207 void serialize(std::ostream& os) const;
208
217 vector_bytes serialize(unsigned header_size_bytes = 0) const;
218
226 static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
227
236 static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
237
247 static size_t get_max_serialized_size_bytes(uint8_t lg_k);
248
250 uint32_t get_num_coupons() const;
251
253 // this should catch some forms of corruption during serialization-deserialization
254 bool validate() const;
255
256private:
257 static const uint8_t SERIAL_VERSION = 1;
258 static const uint8_t FAMILY = 16;
259
260 enum flags { IS_BIG_ENDIAN, IS_COMPRESSED, HAS_HIP, HAS_TABLE, HAS_WINDOW };
261
262 // Note: except for brief transitional moments, these sketches always obey
263 // the following strict mapping between the flavor of a sketch and the
264 // number of coupons that it has collected
265 enum flavor {
266 EMPTY, // 0 == C < 1
267 SPARSE, // 1 <= C < 3K/32
268 HYBRID, // 3K/32 <= C < K/2
269 PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8]
270 SLIDING // 27K/8 <= C
271 };
272
273 uint8_t lg_k;
274 uint64_t seed;
275 bool was_merged; // is the sketch the result of merging?
276 uint32_t num_coupons; // the number of coupons collected so far
277
278 u32_table<A> surprising_value_table;
279 vector_bytes sliding_window;
280 uint8_t window_offset; // derivable from num_coupons, but made explicit for speed
281 uint8_t first_interesting_column; // This is part of a speed optimization
282
283 double kxp;
284 double hip_est_accum;
285
286 // for deserialization and cpc_union::get_result()
287 cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table,
288 vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
289
290 inline void row_col_update(uint32_t row_col);
291 inline void update_sparse(uint32_t row_col);
292 inline void update_windowed(uint32_t row_col);
293 inline void update_hip(uint32_t row_col);
294 void promote_sparse_to_windowed();
295 void move_window();
296 void refresh_kxp(const uint64_t* bit_matrix);
297
298 friend double get_hip_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
299 friend double get_hip_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
300 friend double get_icon_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
301 friend double get_icon_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
302 double get_hip_estimate() const;
303 double get_icon_estimate() const;
304
305 inline flavor determine_flavor() const;
306 static inline flavor determine_flavor(uint8_t lg_k, uint64_t c);
307
308 static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c);
309
310 // this produces a full-size k-by-64 bit matrix
311 vector_u64 build_bit_matrix() const;
312
313 static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window);
314 inline void write_hip(std::ostream& os) const;
315 inline size_t copy_hip_to_mem(void* dst) const;
316
317 static void check_lg_k(uint8_t lg_k);
318
319 friend cpc_compressor<A>;
320 friend cpc_union_alloc<A>;
321};
322
323} /* namespace datasketches */
324
325#include "cpc_sketch_impl.hpp"
326
327#endif
High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch.
Definition cpc_sketch.hpp:64
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition cpc_sketch_impl.hpp:408
double get_estimate() const
Definition cpc_sketch_impl.hpp:75
static cpc_sketch_alloc< A > deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const A &allocator=A())
This method deserializes a sketch from a given stream.
Definition cpc_sketch_impl.hpp:519
void update(const std::string &value)
Update this sketch with a given string.
Definition cpc_sketch_impl.hpp:109
bool is_empty() const
Definition cpc_sketch_impl.hpp:70
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
The actual size of a compressed CPC sketch has a small random variance, but the following empirically...
Definition cpc_sketch_impl.hpp:713
uint8_t get_lg_k() const
Definition cpc_sketch_impl.hpp:65
A get_allocator() const
Definition cpc_sketch_impl.hpp:60
double get_lower_bound(unsigned kappa) const
Returns the approximate lower error bound given a parameter kappa (1, 2 or 3).
Definition cpc_sketch_impl.hpp:91
double get_upper_bound(unsigned kappa) const
Returns the approximate upper error bound given a parameter kappa (1, 2 or 3).
Definition cpc_sketch_impl.hpp:100
string< A > to_string() const
Returns a human-readable summary of this sketch.
Definition cpc_sketch_impl.hpp:383
High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union.
Definition cpc_union.hpp:40
const uint8_t DEFAULT_LG_K
default log2 of K
Definition cpc_common.hpp:36
DataSketches namespace.
Definition binomial_bounds.hpp:38
void cpc_init()
Allocation and initialization of global decompression (decoding) tables.
Definition cpc_sketch_impl.hpp:39