datasketches-cpp
cpc_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef CPC_SKETCH_HPP_
21 #define CPC_SKETCH_HPP_
22 
23 #include <iostream>
24 #include <functional>
25 #include <string>
26 #include <vector>
27 
28 #include "u32_table.hpp"
29 #include "cpc_common.hpp"
30 #include "cpc_compressor.hpp"
31 #include "cpc_confidence.hpp"
32 #include "common_defs.hpp"
33 
34 namespace datasketches {
35 
36 // forward declarations
37 template<typename A> class cpc_sketch_alloc;
38 template<typename A> class cpc_union_alloc;
39 
42 
51 template<typename A> void cpc_init();
52 
63 template<typename A>
65 public:
66  using allocator_type = A;
67  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
68  using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
69 
76  explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
77 
79  A get_allocator() const;
80 
82  uint8_t get_lg_k() const;
83 
85  bool is_empty() const;
86 
88  double get_estimate() const;
89 
97  double get_lower_bound(unsigned kappa) const;
98 
106  double get_upper_bound(unsigned kappa) const;
107 
112  void update(const std::string& value);
113 
118  void update(uint64_t value);
119 
124  void update(int64_t value);
125 
131  void update(uint32_t value);
132 
138  void update(int32_t value);
139 
145  void update(uint16_t value);
146 
152  void update(int16_t value);
153 
159  void update(uint8_t value);
160 
166  void update(int8_t value);
167 
173  void update(double value);
174 
180  void update(float value);
181 
195  void update(const void* value, size_t size);
196 
201  string<A> to_string() const;
202 
207  void serialize(std::ostream& os) const;
208 
217  vector_bytes serialize(unsigned header_size_bytes = 0) const;
218 
226  static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
227 
236  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
237 
247  static size_t get_max_serialized_size_bytes(uint8_t lg_k);
248 
250  uint32_t get_num_coupons() const;
251 
253  // this should catch some forms of corruption during serialization-deserialization
254  bool validate() const;
255 
256 private:
257  static const uint8_t SERIAL_VERSION = 1;
258  static const uint8_t FAMILY = 16;
259 
260  enum flags { IS_BIG_ENDIAN, IS_COMPRESSED, HAS_HIP, HAS_TABLE, HAS_WINDOW };
261 
262  // Note: except for brief transitional moments, these sketches always obey
263  // the following strict mapping between the flavor of a sketch and the
264  // number of coupons that it has collected
265  enum flavor {
266  EMPTY, // 0 == C < 1
267  SPARSE, // 1 <= C < 3K/32
268  HYBRID, // 3K/32 <= C < K/2
269  PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8]
270  SLIDING // 27K/8 <= C
271  };
272 
273  uint8_t lg_k;
274  uint64_t seed;
275  bool was_merged; // is the sketch the result of merging?
276  uint32_t num_coupons; // the number of coupons collected so far
277 
278  u32_table<A> surprising_value_table;
279  vector_bytes sliding_window;
280  uint8_t window_offset; // derivable from num_coupons, but made explicit for speed
281  uint8_t first_interesting_column; // This is part of a speed optimization
282 
283  double kxp;
284  double hip_est_accum;
285 
286  // for deserialization and cpc_union::get_result()
287  cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table,
288  vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
289 
290  inline void row_col_update(uint32_t row_col);
291  inline void update_sparse(uint32_t row_col);
292  inline void update_windowed(uint32_t row_col);
293  inline void update_hip(uint32_t row_col);
294  void promote_sparse_to_windowed();
295  void move_window();
296  void refresh_kxp(const uint64_t* bit_matrix);
297 
298  friend double get_hip_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
299  friend double get_hip_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
300  friend double get_icon_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
301  friend double get_icon_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa);
302  double get_hip_estimate() const;
303  double get_icon_estimate() const;
304 
305  inline flavor determine_flavor() const;
306  static inline flavor determine_flavor(uint8_t lg_k, uint64_t c);
307 
308  static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c);
309 
310  // this produces a full-size k-by-64 bit matrix
311  vector_u64 build_bit_matrix() const;
312 
313  static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window);
314  inline void write_hip(std::ostream& os) const;
315  inline size_t copy_hip_to_mem(void* dst) const;
316 
317  static void check_lg_k(uint8_t lg_k);
318 
319  friend cpc_compressor<A>;
320  friend cpc_union_alloc<A>;
321 };
322 
323 } /* namespace datasketches */
324 
325 #include "cpc_sketch_impl.hpp"
326 
327 #endif
High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch.
Definition: cpc_sketch.hpp:64
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition: cpc_sketch_impl.hpp:408
double get_estimate() const
Definition: cpc_sketch_impl.hpp:75
cpc_sketch_alloc(uint8_t lg_k=cpc_constants::DEFAULT_LG_K, uint64_t seed=DEFAULT_SEED, const A &allocator=A())
Creates an instance of the sketch given the lg_k parameter and hash seed.
Definition: cpc_sketch_impl.hpp:44
static cpc_sketch_alloc< A > deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const A &allocator=A())
This method deserializes a sketch from a given stream.
Definition: cpc_sketch_impl.hpp:519
void update(const std::string &value)
Update this sketch with a given string.
Definition: cpc_sketch_impl.hpp:109
bool is_empty() const
Definition: cpc_sketch_impl.hpp:70
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
The actual size of a compressed CPC sketch has a small random variance, but the following empirically...
Definition: cpc_sketch_impl.hpp:713
uint8_t get_lg_k() const
Definition: cpc_sketch_impl.hpp:65
A get_allocator() const
Definition: cpc_sketch_impl.hpp:60
double get_lower_bound(unsigned kappa) const
Returns the approximate lower error bound given a parameter kappa (1, 2 or 3).
Definition: cpc_sketch_impl.hpp:91
double get_upper_bound(unsigned kappa) const
Returns the approximate upper error bound given a parameter kappa (1, 2 or 3).
Definition: cpc_sketch_impl.hpp:100
string< A > to_string() const
Returns a human-readable summary of this sketch.
Definition: cpc_sketch_impl.hpp:383
High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union.
Definition: cpc_union.hpp:40
const uint8_t DEFAULT_LG_K
default log2 of K
Definition: cpc_common.hpp:36
DataSketches namespace.
Definition: binomial_bounds.hpp:38
void cpc_init()
Allocation and initialization of global decompression (decoding) tables.
Definition: cpc_sketch_impl.hpp:39