datasketches-cpp
Loading...
Searching...
No Matches
frequent_items_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef FREQUENT_ITEMS_SKETCH_HPP_
21#define FREQUENT_ITEMS_SKETCH_HPP_
22
23#include <memory>
24#include <vector>
25#include <iostream>
26#include <functional>
27#include <type_traits>
28
29#include "reverse_purge_hash_map.hpp"
30#include "common_defs.hpp"
31#include "serde.hpp"
32
33namespace datasketches {
34
40
53template<
54 typename T,
55 typename W = uint64_t,
56 typename H = std::hash<T>,
57 typename E = std::equal_to<T>,
58 typename A = std::allocator<T>
59>
61 static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
62public:
63
64 static const uint8_t LG_MIN_MAP_SIZE = 3;
65
77 explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE,
78 const E& equal = E(), const A& allocator = A());
79
88 void update(const T& item, W weight = 1);
89
98 void update(T&& item, W weight = 1);
99
107 void merge(const frequent_items_sketch& other);
108
116 void merge(frequent_items_sketch&& other);
117
121 bool is_empty() const;
122
126 uint32_t get_num_active_items() const;
127
133 W get_total_weight() const;
134
143 W get_estimate(const T& item) const;
144
152 W get_lower_bound(const T& item) const;
153
161 W get_upper_bound(const T& item) const;
162
168 W get_maximum_error() const;
169
175 double get_epsilon() const;
176
183 static double get_epsilon(uint8_t lg_max_map_size);
184
192 static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
193
194 class row;
195 using vector_row = typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>>;
196
218 vector_row get_frequent_items(frequent_items_error_type err_type) const;
219
242 vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
243
250 template<typename SerDe = serde<T>>
251 size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
252
258 template<typename SerDe = serde<T>>
259 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
260
261 // This is a convenience alias for users
262 // The type returned by the following serialize method
263 using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
264
274 template<typename SerDe = serde<T>>
275 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
276
285 template<typename SerDe = serde<T>>
286 static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
287 const E& equal = E(), const A& allocator = A());
288
298 template<typename SerDe = serde<T>>
299 static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
300 const E& equal = E(), const A& allocator = A());
301
306 string<A> to_string(bool print_items = false) const;
307
308private:
309 static const uint8_t SERIAL_VERSION = 1;
310 static const uint8_t FAMILY_ID = 10;
311 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
312 static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
313 static constexpr double EPSILON_FACTOR = 3.5;
314 // due to a mistake different bits were used in C++ and Java to indicate empty sketch
315 // therefore both are set and checked for compatibility with historical binary format
316 enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
317 W total_weight;
318 W offset;
319 reverse_purge_hash_map<T, W, H, E, A> map;
320 static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
321 static void check_serial_version(uint8_t serial_version);
322 static void check_family_id(uint8_t family_id);
323 static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
324
325 // version for integral signed type
326 template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
327 static inline void check_weight(WW weight);
328
329 // version for integral unsigned type
330 template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
331 static inline void check_weight(WW weight);
332
333 // version for floating point type
334 template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
335 static inline void check_weight(WW weight);
336
337 // for deserialize
338 class items_deleter;
339};
340
342template<typename T, typename W, typename H, typename E, typename A>
343class frequent_items_sketch<T, W, H, E, A>::row {
344public:
345 row(const T* item, W weight, W offset):
346 item(item), weight(weight), offset(offset) {}
348 const T& get_item() const { return *item; }
350 W get_estimate() const { return weight + offset; }
352 W get_lower_bound() const { return weight; }
354 W get_upper_bound() const { return weight + offset; }
355private:
356 const T* item;
357 W weight;
358 W offset;
359};
360
361}
362
363#include "frequent_items_sketch_impl.hpp"
364
365# endif
Row in the output from get_frequent_items.
Definition frequent_items_sketch.hpp:343
W get_upper_bound() const
Definition frequent_items_sketch.hpp:354
W get_estimate() const
Definition frequent_items_sketch.hpp:350
const T & get_item() const
Definition frequent_items_sketch.hpp:348
W get_lower_bound() const
Definition frequent_items_sketch.hpp:352
Frequent Items sketch.
Definition frequent_items_sketch.hpp:60
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
W get_upper_bound(const T &item) const
Returns the guaranteed upper bound weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:118
void merge(const frequent_items_sketch &other)
This function merges the other sketch into this one.
Definition frequent_items_sketch_impl.hpp:68
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition frequent_items_sketch_impl.hpp:165
vector_row get_frequent_items(frequent_items_error_type err_type) const
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an erro...
Definition frequent_items_sketch_impl.hpp:144
bool is_empty() const
Definition frequent_items_sketch_impl.hpp:90
string< A > to_string(bool print_items=false) const
Returns a human readable summary of this sketch.
Definition frequent_items_sketch_impl.hpp:432
void update(const T &item, W weight=1)
Update this sketch with an item and a positive weight (frequency count).
Definition frequent_items_sketch_impl.hpp:52
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight)
Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_st...
Definition frequent_items_sketch_impl.hpp:138
W get_total_weight() const
Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
Definition frequent_items_sketch_impl.hpp:100
static frequent_items_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
double get_epsilon() const
Returns epsilon value of this sketch.
Definition frequent_items_sketch_impl.hpp:128
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition frequent_items_sketch_impl.hpp:212
W get_maximum_error() const
Definition frequent_items_sketch_impl.hpp:123
W get_estimate(const T &item) const
Returns the estimate of the weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:105
static frequent_items_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A())
This method deserializes a sketch from a given stream.
W get_lower_bound(const T &item) const
Returns the guaranteed lower bound weight (frequency) of the given item.
Definition frequent_items_sketch_impl.hpp:113
uint32_t get_num_active_items() const
Definition frequent_items_sketch_impl.hpp:95
DataSketches namespace.
Definition binomial_bounds.hpp:38
frequent_items_error_type
Frequent items error type.
Definition frequent_items_sketch.hpp:36
@ NO_FALSE_NEGATIVES
include an item in the result list if get_upper_bound(item) > threshold
Definition frequent_items_sketch.hpp:38
@ NO_FALSE_POSITIVES
include an item in the result list if get_lower_bound(item) > threshold
Definition frequent_items_sketch.hpp:37