datasketches-cpp
Loading...
Searching...
No Matches
frequent_items_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef FREQUENT_ITEMS_SKETCH_HPP_
21#define FREQUENT_ITEMS_SKETCH_HPP_
22
23#include <memory>
24#include <vector>
25#include <iostream>
26#include <functional>
27#include <type_traits>
28
29#include "reverse_purge_hash_map.hpp"
30#include "common_defs.hpp"
31#include "serde.hpp"
32
33namespace datasketches {
34
40
48template<
49 typename T,
50 typename W = uint64_t,
51 typename H = std::hash<T>,
52 typename E = std::equal_to<T>,
53 typename A = std::allocator<T>
54>
56 static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
57public:
58
59 static const uint8_t LG_MIN_MAP_SIZE = 3;
60
72 explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE,
73 const E& equal = E(), const A& allocator = A());
74
81 void update(const T& item, W weight = 1);
82
89 void update(T&& item, W weight = 1);
90
96 void merge(const frequent_items_sketch& other);
97
103 void merge(frequent_items_sketch&& other);
104
108 bool is_empty() const;
109
113 uint32_t get_num_active_items() const;
114
120 W get_total_weight() const;
121
130 W get_estimate(const T& item) const;
131
139 W get_lower_bound(const T& item) const;
140
148 W get_upper_bound(const T& item) const;
149
155 W get_maximum_error() const;
156
162 double get_epsilon() const;
163
170 static double get_epsilon(uint8_t lg_max_map_size);
171
179 static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
180
181 class row;
182 using vector_row = typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>>;
183
205 vector_row get_frequent_items(frequent_items_error_type err_type) const;
206
229 vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
230
237 template<typename SerDe = serde<T>>
238 size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
239
245 template<typename SerDe = serde<T>>
246 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
247
248 // This is a convenience alias for users
249 // The type returned by the following serialize method
250 using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
251
261 template<typename SerDe = serde<T>>
262 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
263
272 template<typename SerDe = serde<T>>
273 static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
274 const E& equal = E(), const A& allocator = A());
275
285 template<typename SerDe = serde<T>>
286 static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
287 const E& equal = E(), const A& allocator = A());
288
293 string<A> to_string(bool print_items = false) const;
294
295private:
296 static const uint8_t SERIAL_VERSION = 1;
297 static const uint8_t FAMILY_ID = 10;
298 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
299 static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
300 static constexpr double EPSILON_FACTOR = 3.5;
301 // due to a mistake different bits were used in C++ and Java to indicate empty sketch
302 // therefore both are set and checked for compatibility with historical binary format
303 enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
304 W total_weight;
305 W offset;
306 reverse_purge_hash_map<T, W, H, E, A> map;
307 static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
308 static void check_serial_version(uint8_t serial_version);
309 static void check_family_id(uint8_t family_id);
310 static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
311
312 // version for integral signed type
313 template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
314 static inline void check_weight(WW weight);
315
316 // version for integral unsigned type
317 template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
318 static inline void check_weight(WW weight);
319
320 // version for floating point type
321 template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
322 static inline void check_weight(WW weight);
323
324 // for deserialize
325 class items_deleter;
326};
327
329template<typename T, typename W, typename H, typename E, typename A>
330class frequent_items_sketch<T, W, H, E, A>::row {
331public:
332 row(const T* item, W weight, W offset):
333 item(item), weight(weight), offset(offset) {}
335 const T& get_item() const { return *item; }
337 W get_estimate() const { return weight + offset; }
339 W get_lower_bound() const { return weight; }
341 W get_upper_bound() const { return weight + offset; }
342private:
343 const T* item;
344 W weight;
345 W offset;
346};
347
348}
349
350#include "frequent_items_sketch_impl.hpp"
351
352# endif
Row in the output from get_frequent_items.
Definition frequent_items_sketch.hpp:330
W get_upper_bound() const
Definition frequent_items_sketch.hpp:341
W get_estimate() const
Definition frequent_items_sketch.hpp:337
const T & get_item() const
Definition frequent_items_sketch.hpp:335
W get_lower_bound() const
Definition frequent_items_sketch.hpp:339
Frequent Items sketch.
Definition frequent_items_sketch.hpp:55
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
W get_upper_bound(const T &item) const
Returns the guaranteed upper bound weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:118
void merge(const frequent_items_sketch &other)
This function merges the other sketch into this one.
Definition frequent_items_sketch_impl.hpp:68
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition frequent_items_sketch_impl.hpp:165
vector_row get_frequent_items(frequent_items_error_type err_type) const
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an erro...
Definition frequent_items_sketch_impl.hpp:144
bool is_empty() const
Definition frequent_items_sketch_impl.hpp:90
string< A > to_string(bool print_items=false) const
Returns a human readable summary of this sketch.
Definition frequent_items_sketch_impl.hpp:433
void update(const T &item, W weight=1)
Update this sketch with an item and a positive weight (frequency count).
Definition frequent_items_sketch_impl.hpp:52
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight)
Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_st...
Definition frequent_items_sketch_impl.hpp:138
W get_total_weight() const
Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
Definition frequent_items_sketch_impl.hpp:100
static frequent_items_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
double get_epsilon() const
Returns epsilon value of this sketch.
Definition frequent_items_sketch_impl.hpp:128
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition frequent_items_sketch_impl.hpp:212
W get_maximum_error() const
Definition frequent_items_sketch_impl.hpp:123
W get_estimate(const T &item) const
Returns the estimate of the weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:105
static frequent_items_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given stream.
W get_lower_bound(const T &item) const
Returns the guaranteed lower bound weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:113
uint32_t get_num_active_items() const
Definition frequent_items_sketch_impl.hpp:95
DataSketches namespace.
Definition binomial_bounds.hpp:38
frequent_items_error_type
Frequent items error type.
Definition frequent_items_sketch.hpp:36
@ NO_FALSE_NEGATIVES
include an item in the result list if get_upper_bound(item) > threshold
Definition frequent_items_sketch.hpp:38
@ NO_FALSE_POSITIVES
include an item in the result list if get_lower_bound(item) > threshold
Definition frequent_items_sketch.hpp:37