datasketches-cpp
var_opt_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _VAR_OPT_SKETCH_HPP_
21 #define _VAR_OPT_SKETCH_HPP_
22 
23 #include "serde.hpp"
24 #include "common_defs.hpp"
25 
26 #include <iterator>
27 #include <vector>
28 
29 namespace datasketches {
30 
31 template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32 template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
33 
34 /*
35  * A struct to hold the result of subset sum queries
36  */
37 struct subset_summary {
38  double lower_bound;
39  double estimate;
40  double upper_bound;
41  double total_sketch_weight;
42 };
43 
44 template <typename T, typename A> class var_opt_union; // forward declaration
45 
47 namespace var_opt_constants {
49  const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
51  const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
52 }
53 
63 template<
64  typename T,
65  typename A = std::allocator<T>
66 >
68 
69  public:
70  static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
71  static const uint32_t MAX_K = var_opt_constants::MAX_K;
72 
79  explicit var_opt_sketch(uint32_t k,
81  const A& allocator = A());
82 
87  var_opt_sketch(const var_opt_sketch& other);
88 
93  var_opt_sketch(var_opt_sketch&& other) noexcept;
94 
95  ~var_opt_sketch();
96 
102  var_opt_sketch& operator=(const var_opt_sketch& other);
103 
110 
117  void update(const T& item, double weight = 1.0);
118 
125  void update(T&& item, double weight = 1.0);
126 
131  inline uint32_t get_k() const;
132 
137  inline uint64_t get_n() const;
138 
143  inline uint32_t get_num_samples() const;
144 
153  template<typename P>
154  subset_summary estimate_subset_sum(P predicate) const;
155 
160  inline bool is_empty() const;
161 
165  void reset();
166 
173  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
174  inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
175 
182  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
183  inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
184 
185  // This is a convenience alias for users
186  // The type returned by the following serialize method
187  using vector_bytes = vector_u8<A>;
188 
197  template<typename SerDe = serde<T>>
198  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
199 
205  template<typename SerDe = serde<T>>
206  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
207 
215  template<typename SerDe = serde<T>>
216  static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
217 
226  template<typename SerDe = serde<T>>
227  static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
228 
233  string<A> to_string() const;
234 
243  string<A> items_to_string() const;
244 
245  class const_iterator;
246 
252  const_iterator begin() const;
253 
260  const_iterator end() const;
261 
262  private:
263  typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
264  typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
265 
266  static const uint32_t MIN_LG_ARR_ITEMS = 3;
267 
268  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
269  static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
270  static const uint8_t PREAMBLE_LONGS_FULL = 4;
271  static const uint8_t SER_VER = 2;
272  static const uint8_t FAMILY_ID = 13;
273  static const uint8_t EMPTY_FLAG_MASK = 4;
274  static const uint8_t GADGET_FLAG_MASK = 128;
275 
276  // Number of standard deviations to use for subset sum error bounds
277  constexpr static const double DEFAULT_KAPPA = 2.0;
278 
279  // TODO: should probably rearrange a bit to minimize gaps once aligned
280  uint32_t k_; // max size of sketch, in items
281 
282  uint32_t h_; // number of items in heap
283  uint32_t m_; // number of items in middle region
284  uint32_t r_; // number of items in reservoir-like region
285 
286  uint64_t n_; // total number of items processed by sketch
287  double total_wt_r_; // total weight of items in reservoir-like area
288 
289  resize_factor rf_; // resize factor
290 
291  uint32_t curr_items_alloc_; // currently allocated array size
292  bool filled_data_; // true if we've explicitly set all entries in data_
293 
294  A allocator_;
295  T* data_; // stored sampled items
296  double* weights_; // weights for sampled items
297 
298  // The next two fields are hidden from the user because they are part of the state of the
299  // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
300  // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
301  // more sense logically for these fields to be declared in the unioning object (whose entire
302  // purpose is storing the state of the unioning algorithm) but for reasons of programming
303  // convenience we are currently declaring them here. However, that could change in the future.
304 
305  // Following int is:
306  // 1. Zero (for a varopt sketch)
307  // 2. Count of marked items in H region, if part of a unioning algo's gadget
308  uint32_t num_marks_in_h_;
309 
310  // The following array is absent in a varopt sketch, and notionally present in a gadget
311  // (although it really belongs in the unioning object). If the array were to be made explicit,
312  // some additional coding would need to be done to ensure that all of the necessary data motion
313  // occurs and is properly tracked.
314  bool* marks_;
315 
316  // used during deserialization to avoid memory leaks upon errors
317  class items_deleter;
318  class weights_deleter;
319  class marks_deleter;
320 
321  var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, const A& allocator);
322  var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
323  uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
324  std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
325  std::unique_ptr<bool, marks_deleter> marks, const A& allocator);
326 
327  friend class var_opt_union<T, A>;
328  var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
329 
330  string<A> items_to_string(bool print_gap) const;
331 
332  // internal-use-only update
333  template<typename O>
334  inline void update(O&& item, double weight, bool mark);
335 
336  template<typename O>
337  inline void update_warmup_phase(O&& item, double weight, bool mark);
338 
339  template<typename O>
340  inline void update_light(O&& item, double weight, bool mark);
341 
342  template<typename O>
343  inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
344 
345  template<typename O>
346  inline void update_heavy_general(O&& item, double weight, bool mark);
347 
348  inline double get_tau() const;
349  inline double peek_min() const;
350  inline bool is_marked(uint32_t idx) const;
351 
352  inline uint32_t pick_random_slot_in_r() const;
353  inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
354  inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
355 
356  template<typename O>
357  inline void push(O&& item, double wt, bool mark);
358  inline void transition_from_warmup();
359  inline void convert_to_heap();
360  inline void restore_towards_leaves(uint32_t slot_in);
361  inline void restore_towards_root(uint32_t slot_in);
362  inline void pop_min_to_m_region();
363  void grow_candidate_set(double wt_cands, uint32_t num_cands);
364  void decrease_k_by_1();
365  void strip_marks();
366  void force_set_k(uint32_t k); // used to resolve union gadget into sketch
367  void downsample_candidate_set(double wt_cands, uint32_t num_cands);
368  inline void swap_values(uint32_t src, uint32_t dst);
369  void grow_data_arrays();
370  void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
371 
372  // validation
373  static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
374  static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
375  static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
376  uint32_t h, uint32_t r, resize_factor rf);
377 
378  // things to move to common and be shared among sketches
379  static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
380  static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
381  static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
382  static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
383  static bool is_power_of_2(uint32_t v);
384  static uint32_t to_log_2(uint32_t v);
385  static inline uint32_t next_int(uint32_t max_value);
386  static inline double next_double_exclude_zero();
387 
388  class iterator;
389 };
390 
391 template<typename T, typename A>
392 class var_opt_sketch<T, A>::const_iterator {
393 public:
394  using iterator_category = std::input_iterator_tag;
395  using value_type = std::pair<const T&, const double>;
396  using difference_type = void;
397  using pointer = const return_value_holder<value_type>;
398  using reference = const value_type;
399 
400  const_iterator(const const_iterator& other);
401  const_iterator& operator++();
402  const_iterator& operator++(int);
403  bool operator==(const const_iterator& other) const;
404  bool operator!=(const const_iterator& other) const;
405  reference operator*() const;
406  pointer operator->() const;
407 
408 private:
409  friend class var_opt_sketch<T, A>;
410  friend class var_opt_union<T, A>;
411 
412  // default iterator over full sketch
413  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
414 
415  // iterates over only one of the H or R regions
416  // does not apply weight correction
417  const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
418 
419  bool get_mark() const;
420 
421  const var_opt_sketch<T, A>* sk_;
422  double cum_r_weight_; // used for weight correction
423  double r_item_wt_;
424  size_t idx_;
425  const size_t final_idx_;
426 };
427 
428 // non-const iterator for internal use
429 template<typename T, typename A>
430 class var_opt_sketch<T, A>::iterator {
431 public:
432  using iterator_category = std::input_iterator_tag;
433  using value_type = std::pair<T&, double>;
434  using difference_type = void;
435  using pointer = return_value_holder<value_type>;
436  using reference = value_type;
437 
438  iterator(const iterator& other);
439  iterator& operator++();
440  iterator& operator++(int);
441  bool operator==(const iterator& other) const;
442  bool operator!=(const iterator& other) const;
443  reference operator*();
444  pointer operator->();
445 
446 private:
447  friend class var_opt_sketch<T, A>;
448  friend class var_opt_union<T, A>;
449 
450  // iterates over only one of the H or R region, applying weight correction
451  // if iterating over R region (can correct for numerical precision issues)
452  iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
453 
454  bool get_mark() const;
455 
456  const var_opt_sketch<T, A>* sk_;
457  double cum_r_weight_; // used for weight correction
458  double r_item_wt_;
459  size_t idx_;
460  const size_t final_idx_;
461 };
462 
463 
464 } // namespace datasketches
465 
466 #include "var_opt_sketch_impl.hpp"
467 
468 #endif // _VAR_OPT_SKETCH_HPP_
This sketch samples data from a stream of items.
Definition: var_opt_sketch.hpp:67
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition: var_opt_sketch_impl.hpp:735
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: var_opt_sketch_impl.hpp:1486
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition: var_opt_sketch_impl.hpp:709
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
var_opt_sketch & operator=(const var_opt_sketch &other)
Copy assignment.
Definition: var_opt_sketch_impl.hpp:217
bool is_empty() const
Returns true if the sketch is empty.
Definition: var_opt_sketch_impl.hpp:643
var_opt_sketch(uint32_t k, resize_factor rf=var_opt_constants::DEFAULT_RESIZE_FACTOR, const A &allocator=A())
Constructor.
Definition: var_opt_sketch_impl.hpp:46
subset_summary estimate_subset_sum(P predicate) const
Computes an estimated subset sum from the entire stream for objects matching a given predicate.
Definition: var_opt_sketch_impl.hpp:1381
static var_opt_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
uint32_t get_num_samples() const
Returns the number of samples currently in the sketch.
Definition: var_opt_sketch_impl.hpp:703
void reset()
Resets the sketch to its default, empty state.
Definition: var_opt_sketch_impl.hpp:648
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: var_opt_sketch_impl.hpp:297
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: var_opt_sketch_impl.hpp:1491
static var_opt_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition: var_opt_sketch_impl.hpp:719
uint32_t get_k() const
Returns the configured maximum sample size.
Definition: var_opt_sketch_impl.hpp:698
uint64_t get_n() const
Returns the length of the input stream.
Definition: var_opt_sketch_impl.hpp:693
Provides a unioning operation over var_opt_sketch objects.
Definition: var_opt_union.hpp:52
const resize_factor DEFAULT_RESIZE_FACTOR
default resize factor
Definition: var_opt_sketch.hpp:49
const uint32_t MAX_K
maximum value of parameter K
Definition: var_opt_sketch.hpp:51
DataSketches namespace.
Definition: binomial_bounds.hpp:38