datasketches-cpp
frequent_items_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef FREQUENT_ITEMS_SKETCH_HPP_
21 #define FREQUENT_ITEMS_SKETCH_HPP_
22 
23 #include <memory>
24 #include <vector>
25 #include <iostream>
26 #include <functional>
27 #include <type_traits>
28 
29 #include "reverse_purge_hash_map.hpp"
30 #include "common_defs.hpp"
31 #include "serde.hpp"
32 
33 namespace datasketches {
34 
39 };
40 
48 template<
49  typename T,
50  typename W = uint64_t,
51  typename H = std::hash<T>,
52  typename E = std::equal_to<T>,
53  typename A = std::allocator<T>
54 >
56  static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
57 public:
58 
59  static const uint8_t LG_MIN_MAP_SIZE = 3;
60 
72  explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE,
73  const E& equal = E(), const A& allocator = A());
74 
81  void update(const T& item, W weight = 1);
82 
89  void update(T&& item, W weight = 1);
90 
96  void merge(const frequent_items_sketch& other);
97 
103  void merge(frequent_items_sketch&& other);
104 
108  bool is_empty() const;
109 
113  uint32_t get_num_active_items() const;
114 
120  W get_total_weight() const;
121 
130  W get_estimate(const T& item) const;
131 
139  W get_lower_bound(const T& item) const;
140 
148  W get_upper_bound(const T& item) const;
149 
155  W get_maximum_error() const;
156 
162  double get_epsilon() const;
163 
170  static double get_epsilon(uint8_t lg_max_map_size);
171 
179  static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
180 
181  class row;
182  using vector_row = typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>>;
183 
205  vector_row get_frequent_items(frequent_items_error_type err_type) const;
206 
229  vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
230 
237  template<typename SerDe = serde<T>>
238  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
239 
245  template<typename SerDe = serde<T>>
246  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
247 
248  // This is a convenience alias for users
249  // The type returned by the following serialize method
250  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
251 
261  template<typename SerDe = serde<T>>
262  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
263 
272  template<typename SerDe = serde<T>>
273  static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
274  const E& equal = E(), const A& allocator = A());
275 
285  template<typename SerDe = serde<T>>
286  static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
287  const E& equal = E(), const A& allocator = A());
288 
293  string<A> to_string(bool print_items = false) const;
294 
295 private:
296  static const uint8_t SERIAL_VERSION = 1;
297  static const uint8_t FAMILY_ID = 10;
298  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
299  static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
300  static constexpr double EPSILON_FACTOR = 3.5;
301  // due to a mistake different bits were used in C++ and Java to indicate empty sketch
302  // therefore both are set and checked for compatibility with historical binary format
303  enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
304  W total_weight;
305  W offset;
306  reverse_purge_hash_map<T, W, H, E, A> map;
307  static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
308  static void check_serial_version(uint8_t serial_version);
309  static void check_family_id(uint8_t family_id);
310  static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
311 
312  // version for integral signed type
313  template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
314  static inline void check_weight(WW weight);
315 
316  // version for integral unsigned type
317  template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
318  static inline void check_weight(WW weight);
319 
320  // version for floating point type
321  template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
322  static inline void check_weight(WW weight);
323 
324  // for deserialize
325  class items_deleter;
326 };
327 
329 template<typename T, typename W, typename H, typename E, typename A>
330 class frequent_items_sketch<T, W, H, E, A>::row {
331 public:
332  row(const T* item, W weight, W offset):
333  item(item), weight(weight), offset(offset) {}
335  const T& get_item() const { return *item; }
337  W get_estimate() const { return weight + offset; }
339  W get_lower_bound() const { return weight; }
341  W get_upper_bound() const { return weight + offset; }
342 private:
343  const T* item;
344  W weight;
345  W offset;
346 };
347 
348 }
349 
350 #include "frequent_items_sketch_impl.hpp"
351 
352 # endif
Row in the output from get_frequent_items.
Definition: frequent_items_sketch.hpp:330
W get_upper_bound() const
Definition: frequent_items_sketch.hpp:341
W get_estimate() const
Definition: frequent_items_sketch.hpp:337
W get_lower_bound() const
Definition: frequent_items_sketch.hpp:339
const T & get_item() const
Definition: frequent_items_sketch.hpp:335
Frequent Items sketch.
Definition: frequent_items_sketch.hpp:55
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size=LG_MIN_MAP_SIZE, const E &equal=E(), const A &allocator=A())
Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
Definition: frequent_items_sketch_impl.hpp:37
W get_upper_bound(const T &item) const
Returns the guaranteed upper bound weight (frequency) of the given item.
Definition: frequent_items_sketch_impl.hpp:118
void merge(const frequent_items_sketch &other)
This function merges the other sketch into this one.
Definition: frequent_items_sketch_impl.hpp:68
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: frequent_items_sketch_impl.hpp:165
vector_row get_frequent_items(frequent_items_error_type err_type) const
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an erro...
Definition: frequent_items_sketch_impl.hpp:144
bool is_empty() const
Definition: frequent_items_sketch_impl.hpp:90
string< A > to_string(bool print_items=false) const
Returns a human readable summary of this sketch.
Definition: frequent_items_sketch_impl.hpp:433
void update(const T &item, W weight=1)
Update this sketch with an item and a positive weight (frequency count).
Definition: frequent_items_sketch_impl.hpp:52
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight)
Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_st...
Definition: frequent_items_sketch_impl.hpp:138
W get_total_weight() const
Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
Definition: frequent_items_sketch_impl.hpp:100
static frequent_items_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
double get_epsilon() const
Returns epsilon value of this sketch.
Definition: frequent_items_sketch_impl.hpp:128
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: frequent_items_sketch_impl.hpp:212
W get_maximum_error() const
Definition: frequent_items_sketch_impl.hpp:123
W get_estimate(const T &item) const
Returns the estimate of the weight (frequency) of the given item.
Definition: frequent_items_sketch_impl.hpp:105
static frequent_items_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given stream.
W get_lower_bound(const T &item) const
Returns the guaranteed lower bound weight (frequency) of the given item.
Definition: frequent_items_sketch_impl.hpp:113
uint32_t get_num_active_items() const
Definition: frequent_items_sketch_impl.hpp:95
DataSketches namespace.
Definition: binomial_bounds.hpp:38
frequent_items_error_type
Frequent items error type.
Definition: frequent_items_sketch.hpp:36
@ NO_FALSE_NEGATIVES
include an item in the result list if get_upper_bound(item) > threshold
Definition: frequent_items_sketch.hpp:38
@ NO_FALSE_POSITIVES
include an item in the result list if get_lower_bound(item) > threshold
Definition: frequent_items_sketch.hpp:37