Frequent Items sketch.
More...
#include <frequent_items_sketch.hpp>
|
| | frequent_items_sketch (uint8_t lg_max_map_size, uint8_t lg_start_map_size=LG_MIN_MAP_SIZE, const E &equal=E(), const A &allocator=A()) |
| | Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
|
| |
| void | update (const T &item, W weight=1) |
| | Update this sketch with an item and a positive weight (frequency count).
|
| |
| void | update (T &&item, W weight=1) |
| | Update this sketch with an item and a positive weight (frequency count).
|
| |
| void | merge (const frequent_items_sketch &other) |
| | This function merges the other sketch into this one.
|
| |
| void | merge (frequent_items_sketch &&other) |
| | This function merges the other sketch into this one.
|
| |
| bool | is_empty () const |
| |
| uint32_t | get_num_active_items () const |
| |
| W | get_total_weight () const |
| | Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
|
| |
| W | get_estimate (const T &item) const |
| | Returns the estimate of the weight (frequency) of the given item.
|
| |
| W | get_lower_bound (const T &item) const |
| | Returns the guaranteed lower bound weight (frequency) of the given item.
|
| |
| W | get_upper_bound (const T &item) const |
| | Returns the guaranteed upper bound weight (frequency) of the given item.
|
| |
| W | get_maximum_error () const |
| |
| double | get_epsilon () const |
| | Returns epsilon value of this sketch.
|
| |
| vector_row | get_frequent_items (frequent_items_error_type err_type) const |
| | Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and using get_maximum_error() as a threshold.
|
| |
| vector_row | get_frequent_items (frequent_items_error_type err_type, W threshold) const |
| | Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and a threshold.
|
| |
| template<typename SerDe = serde<T>> |
| size_t | get_serialized_size_bytes (const SerDe &sd=SerDe()) const |
| | Computes size needed to serialize the current state of the sketch.
|
| |
| template<typename SerDe = serde<T>> |
| void | serialize (std::ostream &os, const SerDe &sd=SerDe()) const |
| | This method serializes the sketch into a given stream in a binary form.
|
| |
| template<typename SerDe = serde<T>> |
| vector_bytes | serialize (unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const |
| | This method serializes the sketch as a vector of bytes.
|
| |
| string< A > | to_string (bool print_items=false) const |
| | Returns a human readable summary of this sketch.
|
| |
|
| static double | get_epsilon (uint8_t lg_max_map_size) |
| | Returns epsilon used to compute a priori error.
|
| |
| static double | get_apriori_error (uint8_t lg_max_map_size, W estimated_total_weight) |
| | Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.
|
| |
| template<typename SerDe = serde<T>> |
| static frequent_items_sketch | deserialize (std::istream &is, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A()) |
| | This method deserializes a sketch from a given stream.
|
| |
| template<typename SerDe = serde<T>> |
| static frequent_items_sketch | deserialize (const void *bytes, size_t size, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A()) |
| | This method deserializes a sketch from a given array of bytes.
|
| |
template<typename T, typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
class datasketches::frequent_items_sketch< T, W, H, E, A >
Frequent Items sketch.
Based on Java implementation here: https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
- Author
- Alexander Saydakov
Sketch that may retain string values. For sketches containing strings, cross-language portability depends on using compatible string encodings. This class does not by itself enforce UTF-8 validity for all string inputs.
◆ frequent_items_sketch()
template<typename T , typename W , typename H , typename E , typename A >
| frequent_items_sketch |
( |
uint8_t |
lg_max_map_size, |
|
|
uint8_t |
lg_start_map_size = LG_MIN_MAP_SIZE, |
|
|
const E & |
equal = E(), |
|
|
const A & |
allocator = A() |
|
) |
| |
|
explicit |
Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
- Parameters
-
| lg_max_map_size | Log2 of the physical size of the internal hash map managed by this sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size. Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size. |
| lg_start_map_size | Log2 of the starting physical size of the internal hash map managed by this sketch. |
| equal | instance of Equality operator |
| allocator | instance of an Allocator |
◆ update() [1/2]
template<typename T , typename W , typename H , typename E , typename A >
| void update |
( |
const T & |
item, |
|
|
W |
weight = 1 |
|
) |
| |
Update this sketch with an item and a positive weight (frequency count).
If cross-language portability is required, callers should ensure that the input string uses a compatible encoding (valid UTF-8).
- Parameters
-
| item | for which the weight should be increased (lvalue) |
| weight | the amount by which the weight of the item should be increased A count of zero is a no-op, and a negative count will throw an exception. |
◆ update() [2/2]
template<typename T , typename W , typename H , typename E , typename A >
| void update |
( |
T && |
item, |
|
|
W |
weight = 1 |
|
) |
| |
Update this sketch with an item and a positive weight (frequency count).
If cross-language portability is required, callers should ensure that the input string uses a compatible encoding (valid UTF-8).
- Parameters
-
| item | for which the weight should be increased (rvalue) |
| weight | the amount by which the weight of the item should be increased A count of zero is a no-op, and a negative count will throw an exception. |
◆ merge() [1/2]
template<typename T , typename W , typename H , typename E , typename A >
This function merges the other sketch into this one.
The other sketch may be of a different size. If sketches contain strings, callers are responsible for ensuring that both sketches were built using compatible string encodings.
- Parameters
-
| other | sketch to be merged into this (lvalue) |
◆ merge() [2/2]
template<typename T , typename W , typename H , typename E , typename A >
This function merges the other sketch into this one.
The other sketch may be of a different size. If sketches contain strings, callers are responsible for ensuring that both sketches were built using compatible string encodings.
- Parameters
-
| other | sketch to be merged into this (rvalue) |
◆ is_empty()
template<typename T , typename W , typename H , typename E , typename A >
- Returns
- true if this sketch is empty
◆ get_num_active_items()
template<typename T , typename W , typename H , typename E , typename A >
| uint32_t get_num_active_items |
( |
| ) |
const |
- Returns
- the number of active items in the sketch
◆ get_total_weight()
template<typename T , typename W , typename H , typename E , typename A >
| W get_total_weight |
( |
| ) |
const |
Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
- Returns
- the total weight of all items in the stream seen so far by the sketch
◆ get_estimate()
template<typename T , typename W , typename H , typename E , typename A >
| W get_estimate |
( |
const T & |
item | ) |
const |
Returns the estimate of the weight (frequency) of the given item.
Note: The true frequency of a item would be the sum of the counts as a result of the two update functions.
- Parameters
-
- Returns
- the estimate of the weight (frequency) of the given item
◆ get_lower_bound()
template<typename T , typename W , typename H , typename E , typename A >
| W get_lower_bound |
( |
const T & |
item | ) |
const |
Returns the guaranteed lower bound weight (frequency) of the given item.
- Parameters
-
- Returns
- the guaranteed lower bound weight of the given item. That is, a number which is guaranteed to be no larger than the real weight.
◆ get_upper_bound()
template<typename T , typename W , typename H , typename E , typename A >
| W get_upper_bound |
( |
const T & |
item | ) |
const |
Returns the guaranteed upper bound weight (frequency) of the given item.
- Parameters
-
- Returns
- the guaranteed upper bound weight of the given item. That is, a number which is guaranteed to be no smaller than the real frequency.
◆ get_maximum_error()
template<typename T , typename W , typename H , typename E , typename A >
| W get_maximum_error |
( |
| ) |
const |
- Returns
- An upper bound on the maximum error of get_estimate(item) for any item. This is equivalent to the maximum distance between the upper bound and the lower bound for any item.
◆ get_epsilon() [1/2]
template<typename T , typename W , typename H , typename E , typename A >
| double get_epsilon |
( |
| ) |
const |
Returns epsilon value of this sketch.
This is just the value 3.5 / max_map_size.
- Returns
- epsilon used by the sketch to compute error.
◆ get_epsilon() [2/2]
template<typename T , typename W , typename H , typename E , typename A >
| double get_epsilon |
( |
uint8_t |
lg_max_map_size | ) |
|
|
static |
Returns epsilon used to compute a priori error.
This is just the value 3.5 / maxMapSize.
- Parameters
-
| lg_max_map_size | the planned map size to be used when constructing this sketch. |
- Returns
- epsilon used to compute a priori error.
◆ get_apriori_error()
template<typename T , typename W , typename H , typename E , typename A >
| double get_apriori_error |
( |
uint8_t |
lg_max_map_size, |
|
|
W |
estimated_total_weight |
|
) |
| |
|
static |
Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.
- Parameters
-
| lg_max_map_size | the planned map size to be used when constructing this sketch. |
| estimated_total_weight | the estimated total stream weight. |
- Returns
- the estimated a priori error.
◆ get_frequent_items() [1/2]
template<typename T , typename W , typename H , typename E , typename A >
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and using get_maximum_error() as a threshold.
The method first examines all active items in the sketch (items that have a counter).
If error_type = NO_FALSE_NEGATIVES, this will include an item in the result list if get_upper_bound(item) > threshold. There will be no false negatives, i.e., no Type II error. There may be items in the set with true frequencies less than the threshold (false positives).
If error_type = NO_FALSE_POSITIVES, this will include an item in the result list if get_lower_bound(item) > threshold. There will be no false positives, i.e., no Type I error. There may be items omitted from the set with true frequencies greater than the threshold (false negatives).
- Parameters
-
| err_type | determines whether no false positives or no false negatives are desired. |
- Returns
- an array of frequent items
◆ get_frequent_items() [2/2]
template<typename T , typename W , typename H , typename E , typename A >
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and a threshold.
The method first examines all active items in the sketch (items that have a counter).
If error_type = NO_FALSE_NEGATIVES, this will include an item in the result list if get_upper_bound(item) > threshold. There will be no false negatives, i.e., no Type II error. There may be items in the set with true frequencies less than the threshold (false positives).
If error_type = NO_FALSE_POSITIVES, this will include an item in the result list if get_lower_bound(item) > threshold. There will be no false positives, i.e., no Type I error. There may be items omitted from the set with true frequencies greater than the threshold (false negatives).
- Parameters
-
| err_type | determines whether no false positives or no false negatives are desired. |
| threshold | to include items in the result list |
- Returns
- an array of frequent items
◆ get_serialized_size_bytes()
template<typename T , typename W , typename H , typename E , typename A >
template<typename SerDe >
| size_t get_serialized_size_bytes |
( |
const SerDe & |
sd = SerDe() | ) |
const |
Computes size needed to serialize the current state of the sketch.
This can be expensive since every item needs to be looked at.
- Parameters
-
- Returns
- size in bytes needed to serialize this sketch
◆ serialize() [1/2]
template<typename T , typename W , typename H , typename E , typename A >
template<typename SerDe >
| void serialize |
( |
std::ostream & |
os, |
|
|
const SerDe & |
sd = SerDe() |
|
) |
| const |
This method serializes the sketch into a given stream in a binary form.
- Parameters
-
| os | output stream |
| sd | instance of a SerDe |
◆ serialize() [2/2]
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
| vector_bytes serialize |
( |
unsigned |
header_size_bytes = 0, |
|
|
const SerDe & |
sd = SerDe() |
|
) |
| const |
This method serializes the sketch as a vector of bytes.
An optional header can be reserved in front of the sketch. It is a blank space of a given size. This header is used in Datasketches PostgreSQL extension.
- Parameters
-
| header_size_bytes | space to reserve in front of the sketch |
| sd | instance of a SerDe |
- Returns
- serialized sketch as a vector of bytes
◆ deserialize() [1/2]
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
| static frequent_items_sketch deserialize |
( |
std::istream & |
is, |
|
|
const SerDe & |
sd = SerDe(), |
|
|
const E & |
equal = E(), |
|
|
const A & |
allocator = A() |
|
) |
| |
|
static |
This method deserializes a sketch from a given stream.
- Parameters
-
| is | input stream |
| sd | instance of a SerDe |
| equal | instance of Equality operator |
| allocator | instance of an Allocator |
- Returns
- an instance of the sketch
◆ deserialize() [2/2]
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
| static frequent_items_sketch deserialize |
( |
const void * |
bytes, |
|
|
size_t |
size, |
|
|
const SerDe & |
sd = SerDe(), |
|
|
const E & |
equal = E(), |
|
|
const A & |
allocator = A() |
|
) |
| |
|
static |
This method deserializes a sketch from a given array of bytes.
- Parameters
-
| bytes | pointer to the array of bytes |
| size | the size of the array |
| sd | instance of a SerDe |
| equal | instance of Equality operator |
| allocator | instance of an Allocator |
- Returns
- an instance of the sketch
◆ to_string()
template<typename T , typename W , typename H , typename E , typename A >
| string< A > to_string |
( |
bool |
print_items = false | ) |
const |
Returns a human readable summary of this sketch.
- Parameters
-
| print_items | if true include the list of items retained by the sketch |
The documentation for this class was generated from the following files: