|
| frequent_items_sketch (uint8_t lg_max_map_size, uint8_t lg_start_map_size=LG_MIN_MAP_SIZE, const E &equal=E(), const A &allocator=A()) |
| Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
|
|
void | update (const T &item, W weight=1) |
| Update this sketch with an item and a positive weight (frequency count).
|
|
void | update (T &&item, W weight=1) |
| Update this sketch with an item and a positive weight (frequency count).
|
|
void | merge (const frequent_items_sketch &other) |
| This function merges the other sketch into this one.
|
|
void | merge (frequent_items_sketch &&other) |
| This function merges the other sketch into this one.
|
|
bool | is_empty () const |
|
uint32_t | get_num_active_items () const |
|
W | get_total_weight () const |
| Returns the sum of the weights (frequencies) in the stream seen so far by the sketch.
|
|
W | get_estimate (const T &item) const |
| Returns the estimate of the weight (frequency) of the given item.
|
|
W | get_lower_bound (const T &item) const |
| Returns the guaranteed lower bound weight (frequency) of the given item.
|
|
W | get_upper_bound (const T &item) const |
| Returns the guaranteed upper bound weight (frequency) of the given item.
|
|
W | get_maximum_error () const |
|
double | get_epsilon () const |
| Returns epsilon value of this sketch.
|
|
vector_row | get_frequent_items (frequent_items_error_type err_type) const |
| Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and using get_maximum_error() as a threshold.
|
|
vector_row | get_frequent_items (frequent_items_error_type err_type, W threshold) const |
| Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and a threshold.
|
|
template<typename SerDe = serde<T>> |
size_t | get_serialized_size_bytes (const SerDe &sd=SerDe()) const |
| Computes size needed to serialize the current state of the sketch.
|
|
template<typename SerDe = serde<T>> |
void | serialize (std::ostream &os, const SerDe &sd=SerDe()) const |
| This method serializes the sketch into a given stream in a binary form.
|
|
template<typename SerDe = serde<T>> |
vector_bytes | serialize (unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const |
| This method serializes the sketch as a vector of bytes.
|
|
string< A > | to_string (bool print_items=false) const |
| Returns a human readable summary of this sketch.
|
|
|
static double | get_epsilon (uint8_t lg_max_map_size) |
| Returns epsilon used to compute a priori error.
|
|
static double | get_apriori_error (uint8_t lg_max_map_size, W estimated_total_weight) |
| Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.
|
|
template<typename SerDe = serde<T>> |
static frequent_items_sketch | deserialize (std::istream &is, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A()) |
| This method deserializes a sketch from a given stream.
|
|
template<typename SerDe = serde<T>> |
static frequent_items_sketch | deserialize (const void *bytes, size_t size, const SerDe &sd=SerDe(), const E &equal=E(), const A &allocator=A()) |
| This method deserializes a sketch from a given array of bytes.
|
|
template<typename T, typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
class datasketches::frequent_items_sketch< T, W, H, E, A >
Frequent Items sketch.
Based on Java implementation here: https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
- Author
- Alexander Saydakov
template<typename T , typename W , typename H , typename E , typename A >
This function merges the other sketch into this one.
The other sketch may be of a different size.
- Parameters
-
other | sketch to be merged into this (lvalue) |
template<typename T , typename W , typename H , typename E , typename A >
This function merges the other sketch into this one.
The other sketch may be of a different size.
- Parameters
-
other | sketch to be merged into this (rvalue) |
template<typename T , typename W , typename H , typename E , typename A >
W get_estimate |
( |
const T & |
item | ) |
const |
Returns the estimate of the weight (frequency) of the given item.
Note: The true frequency of a item would be the sum of the counts as a result of the two update functions.
- Parameters
-
- Returns
- the estimate of the weight (frequency) of the given item
template<typename T , typename W , typename H , typename E , typename A >
double get_epsilon |
( |
| ) |
const |
Returns epsilon value of this sketch.
This is just the value 3.5 / max_map_size.
- Returns
- epsilon used by the sketch to compute error.
template<typename T , typename W , typename H , typename E , typename A >
double get_epsilon |
( |
uint8_t |
lg_max_map_size | ) |
|
|
static |
Returns epsilon used to compute a priori error.
This is just the value 3.5 / maxMapSize.
- Parameters
-
lg_max_map_size | the planned map size to be used when constructing this sketch. |
- Returns
- epsilon used to compute a priori error.
template<typename T , typename W , typename H , typename E , typename A >
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and using get_maximum_error() as a threshold.
The method first examines all active items in the sketch (items that have a counter).
If error_type = NO_FALSE_NEGATIVES, this will include an item in the result list if get_upper_bound(item) > threshold. There will be no false negatives, i.e., no Type II error. There may be items in the set with true frequencies less than the threshold (false positives).
If error_type = NO_FALSE_POSITIVES, this will include an item in the result list if get_lower_bound(item) > threshold. There will be no false positives, i.e., no Type I error. There may be items omitted from the set with true frequencies greater than the threshold (false negatives).
- Parameters
-
err_type | determines whether no false positives or no false negatives are desired. |
- Returns
- an array of frequent items
template<typename T , typename W , typename H , typename E , typename A >
Returns an array of rows that include frequent items, estimates, upper and lower bounds given an error_type and a threshold.
The method first examines all active items in the sketch (items that have a counter).
If error_type = NO_FALSE_NEGATIVES, this will include an item in the result list if get_upper_bound(item) > threshold. There will be no false negatives, i.e., no Type II error. There may be items in the set with true frequencies less than the threshold (false positives).
If error_type = NO_FALSE_POSITIVES, this will include an item in the result list if get_lower_bound(item) > threshold. There will be no false positives, i.e., no Type I error. There may be items omitted from the set with true frequencies greater than the threshold (false negatives).
- Parameters
-
err_type | determines whether no false positives or no false negatives are desired. |
threshold | to include items in the result list |
- Returns
- an array of frequent items
template<typename T , typename W , typename H , typename E , typename A >
template<typename SerDe >
size_t get_serialized_size_bytes |
( |
const SerDe & |
sd = SerDe() | ) |
const |
Computes size needed to serialize the current state of the sketch.
This can be expensive since every item needs to be looked at.
- Parameters
-
- Returns
- size in bytes needed to serialize this sketch
template<typename T , typename W , typename H , typename E , typename A >
template<typename SerDe >
void serialize |
( |
std::ostream & |
os, |
|
|
const SerDe & |
sd = SerDe() |
|
) |
| const |
This method serializes the sketch into a given stream in a binary form.
- Parameters
-
os | output stream |
sd | instance of a SerDe |
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
vector_bytes serialize |
( |
unsigned |
header_size_bytes = 0 , |
|
|
const SerDe & |
sd = SerDe() |
|
) |
| const |
This method serializes the sketch as a vector of bytes.
An optional header can be reserved in front of the sketch. It is a blank space of a given size. This header is used in Datasketches PostgreSQL extension.
- Parameters
-
header_size_bytes | space to reserve in front of the sketch |
sd | instance of a SerDe |
- Returns
- serialized sketch as a vector of bytes
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
static frequent_items_sketch deserialize |
( |
std::istream & |
is, |
|
|
const SerDe & |
sd = SerDe() , |
|
|
const E & |
equal = E() , |
|
|
const A & |
allocator = A() |
|
) |
| |
|
static |
This method deserializes a sketch from a given stream.
- Parameters
-
is | input stream |
sd | instance of a SerDe |
equal | instance of Equality operator |
allocator | instance of an Allocator |
- Returns
- an instance of the sketch
template<typename T , typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename A = std::allocator<T>>
template<typename SerDe = serde<T>>
static frequent_items_sketch deserialize |
( |
const void * |
bytes, |
|
|
size_t |
size, |
|
|
const SerDe & |
sd = SerDe() , |
|
|
const E & |
equal = E() , |
|
|
const A & |
allocator = A() |
|
) |
| |
|
static |
This method deserializes a sketch from a given array of bytes.
- Parameters
-
bytes | pointer to the array of bytes |
size | the size of the array |
sd | instance of a SerDe |
equal | instance of Equality operator |
allocator | instance of an Allocator |
- Returns
- an instance of the sketch