20 #ifndef _VAR_OPT_SKETCH_HPP_
21 #define _VAR_OPT_SKETCH_HPP_
24 #include "common_defs.hpp"
31 template<
typename A>
using AllocU8 =
typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32 template<
typename A>
using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
37 struct subset_summary {
41 double total_sketch_weight;
44 template <
typename T,
typename A>
class var_opt_union;
47 namespace var_opt_constants {
51 const uint32_t
MAX_K = ((uint32_t) 1 << 31) - 2;
65 typename A = std::allocator<T>
81 const A& allocator = A());
117 void update(
const T& item,
double weight = 1.0);
125 void update(T&& item,
double weight = 1.0);
131 inline uint32_t
get_k()
const;
137 inline uint64_t
get_n()
const;
173 template<
typename TT = T,
typename SerDe = serde<T>,
typename std::enable_if<std::is_arithmetic<TT>::value,
int>::type = 0>
182 template<
typename TT = T,
typename SerDe = serde<T>,
typename std::enable_if<!std::is_arithmetic<TT>::value,
int>::type = 0>
187 using vector_bytes = vector_u8<A>;
197 template<
typename SerDe = serde<T>>
198 vector_bytes
serialize(
unsigned header_size_bytes = 0,
const SerDe& sd = SerDe())
const;
205 template<
typename SerDe = serde<T>>
206 void serialize(std::ostream& os,
const SerDe& sd = SerDe())
const;
215 template<
typename SerDe = serde<T>>
226 template<
typename SerDe = serde<T>>
245 class const_iterator;
252 const_iterator
begin()
const;
260 const_iterator
end()
const;
263 typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
264 typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
266 static const uint32_t MIN_LG_ARR_ITEMS = 3;
268 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
269 static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
270 static const uint8_t PREAMBLE_LONGS_FULL = 4;
271 static const uint8_t SER_VER = 2;
272 static const uint8_t FAMILY_ID = 13;
273 static const uint8_t EMPTY_FLAG_MASK = 4;
274 static const uint8_t GADGET_FLAG_MASK = 128;
277 constexpr
static const double DEFAULT_KAPPA = 2.0;
291 uint32_t curr_items_alloc_;
308 uint32_t num_marks_in_h_;
318 class weights_deleter;
321 var_opt_sketch(uint32_t k, resize_factor rf,
bool is_gadget,
const A& allocator);
322 var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n,
double total_wt_r, resize_factor rf,
323 uint32_t curr_items_alloc,
bool filled_data, std::unique_ptr<T, items_deleter> items,
324 std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
325 std::unique_ptr<bool, marks_deleter> marks,
const A& allocator);
334 inline void update(O&& item,
double weight,
bool mark);
337 inline void update_warmup_phase(O&& item,
double weight,
bool mark);
340 inline void update_light(O&& item,
double weight,
bool mark);
343 inline void update_heavy_r_eq1(O&& item,
double weight,
bool mark);
346 inline void update_heavy_general(O&& item,
double weight,
bool mark);
348 inline double get_tau()
const;
349 inline double peek_min()
const;
350 inline bool is_marked(uint32_t idx)
const;
352 inline uint32_t pick_random_slot_in_r()
const;
353 inline uint32_t choose_delete_slot(
double wt_cand, uint32_t num_cand)
const;
354 inline uint32_t choose_weighted_delete_slot(
double wt_cand, uint32_t num_cand)
const;
357 inline void push(O&& item,
double wt,
bool mark);
358 inline void transition_from_warmup();
359 inline void convert_to_heap();
360 inline void restore_towards_leaves(uint32_t slot_in);
361 inline void restore_towards_root(uint32_t slot_in);
362 inline void pop_min_to_m_region();
363 void grow_candidate_set(
double wt_cands, uint32_t num_cands);
364 void decrease_k_by_1();
366 void force_set_k(uint32_t k);
367 void downsample_candidate_set(
double wt_cands, uint32_t num_cands);
368 inline void swap_values(uint32_t src, uint32_t dst);
369 void grow_data_arrays();
370 void allocate_data_arrays(uint32_t tgt_size,
bool use_marks);
373 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
374 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
375 static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
376 uint32_t h, uint32_t r, resize_factor rf);
379 static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
380 static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
381 static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k,
double sampling_rate);
382 static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k,
double sampling_rate);
383 static bool is_power_of_2(uint32_t v);
384 static uint32_t to_log_2(uint32_t v);
385 static inline uint32_t next_int(uint32_t max_value);
386 static inline double next_double_exclude_zero();
391 template<
typename T,
typename A>
394 using iterator_category = std::input_iterator_tag;
395 using value_type = std::pair<const T&, const double>;
396 using difference_type = void;
397 using pointer =
const return_value_holder<value_type>;
398 using reference =
const value_type;
400 const_iterator(
const const_iterator& other);
401 const_iterator& operator++();
402 const_iterator& operator++(
int);
403 bool operator==(
const const_iterator& other)
const;
404 bool operator!=(
const const_iterator& other)
const;
405 reference operator*()
const;
406 pointer operator->()
const;
419 bool get_mark()
const;
422 double cum_r_weight_;
425 const size_t final_idx_;
429 template<
typename T,
typename A>
430 class var_opt_sketch<T, A>::iterator {
432 using iterator_category = std::input_iterator_tag;
433 using value_type = std::pair<T&, double>;
434 using difference_type = void;
435 using pointer = return_value_holder<value_type>;
436 using reference = value_type;
438 iterator(
const iterator& other);
439 iterator& operator++();
440 iterator& operator++(
int);
441 bool operator==(
const iterator& other)
const;
442 bool operator!=(
const iterator& other)
const;
443 reference operator*();
444 pointer operator->();
448 friend class var_opt_union<T, A>;
452 iterator(
const var_opt_sketch<T, A>& sk,
bool is_end,
bool use_r_region);
454 bool get_mark()
const;
456 const var_opt_sketch<T, A>* sk_;
457 double cum_r_weight_;
460 const size_t final_idx_;
466 #include "var_opt_sketch_impl.hpp"
This sketch samples data from a stream of items.
Definition: var_opt_sketch.hpp:67
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition: var_opt_sketch_impl.hpp:735
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: var_opt_sketch_impl.hpp:1486
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition: var_opt_sketch_impl.hpp:709
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
var_opt_sketch & operator=(const var_opt_sketch &other)
Copy assignment.
Definition: var_opt_sketch_impl.hpp:217
bool is_empty() const
Returns true if the sketch is empty.
Definition: var_opt_sketch_impl.hpp:643
var_opt_sketch(uint32_t k, resize_factor rf=var_opt_constants::DEFAULT_RESIZE_FACTOR, const A &allocator=A())
Constructor.
Definition: var_opt_sketch_impl.hpp:46
subset_summary estimate_subset_sum(P predicate) const
Computes an estimated subset sum from the entire stream for objects matching a given predicate.
Definition: var_opt_sketch_impl.hpp:1381
static var_opt_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
uint32_t get_num_samples() const
Returns the number of samples currently in the sketch.
Definition: var_opt_sketch_impl.hpp:703
void reset()
Resets the sketch to its default, empty state.
Definition: var_opt_sketch_impl.hpp:648
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: var_opt_sketch_impl.hpp:297
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: var_opt_sketch_impl.hpp:1491
static var_opt_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition: var_opt_sketch_impl.hpp:719
uint32_t get_k() const
Returns the configured maximum sample size.
Definition: var_opt_sketch_impl.hpp:698
uint64_t get_n() const
Returns the length of the input stream.
Definition: var_opt_sketch_impl.hpp:693
Provides a unioning operation over var_opt_sketch objects.
Definition: var_opt_union.hpp:52
const resize_factor DEFAULT_RESIZE_FACTOR
default resize factor
Definition: var_opt_sketch.hpp:49
const uint32_t MAX_K
maximum value of parameter K
Definition: var_opt_sketch.hpp:51
DataSketches namespace.
Definition: binomial_bounds.hpp:38