datasketches-cpp
Loading...
Searching...
No Matches
var_opt_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _VAR_OPT_SKETCH_HPP_
21#define _VAR_OPT_SKETCH_HPP_
22
23#include "serde.hpp"
24#include "common_defs.hpp"
25
26#include <iterator>
27#include <vector>
28
29namespace datasketches {
30
31template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
33
34/*
35 * A struct to hold the result of subset sum queries
36 */
37struct subset_summary {
38 double lower_bound;
39 double estimate;
40 double upper_bound;
41 double total_sketch_weight;
42};
43
44template <typename T, typename A> class var_opt_union; // forward declaration
45
47namespace var_opt_constants {
49 const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
51 const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
52}
53
63template<
64 typename T,
65 typename A = std::allocator<T>
66>
68
69 public:
70 static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
71 static const uint32_t MAX_K = var_opt_constants::MAX_K;
72
79 explicit var_opt_sketch(uint32_t k,
81 const A& allocator = A());
82
87 var_opt_sketch(const var_opt_sketch& other);
88
93 var_opt_sketch(var_opt_sketch&& other) noexcept;
94
96
103
110
117 void update(const T& item, double weight = 1.0);
118
125 void update(T&& item, double weight = 1.0);
126
131 inline uint32_t get_k() const;
132
137 inline uint64_t get_n() const;
138
143 inline uint32_t get_num_samples() const;
144
153 template<typename P>
154 subset_summary estimate_subset_sum(P predicate) const;
155
160 inline bool is_empty() const;
161
165 void reset();
166
173 template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
174 inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
175
182 template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
183 inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
184
185 // This is a convenience alias for users
186 // The type returned by the following serialize method
187 using vector_bytes = vector_u8<A>;
188
197 template<typename SerDe = serde<T>>
198 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
199
205 template<typename SerDe = serde<T>>
206 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
207
215 template<typename SerDe = serde<T>>
216 static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
217
226 template<typename SerDe = serde<T>>
227 static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
228
233 string<A> to_string() const;
234
243 string<A> items_to_string() const;
244
245 class const_iterator;
246
252 const_iterator begin() const;
253
260 const_iterator end() const;
261
262 private:
263 typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
264 typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
265
266 static const uint32_t MIN_LG_ARR_ITEMS = 3;
267
268 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
269 static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
270 static const uint8_t PREAMBLE_LONGS_FULL = 4;
271 static const uint8_t SER_VER = 2;
272 static const uint8_t FAMILY_ID = 13;
273 static const uint8_t EMPTY_FLAG_MASK = 4;
274 static const uint8_t GADGET_FLAG_MASK = 128;
275
276 // Number of standard deviations to use for subset sum error bounds
277 constexpr static const double DEFAULT_KAPPA = 2.0;
278
279 // TODO: should probably rearrange a bit to minimize gaps once aligned
280 uint32_t k_; // max size of sketch, in items
281
282 uint32_t h_; // number of items in heap
283 uint32_t m_; // number of items in middle region
284 uint32_t r_; // number of items in reservoir-like region
285
286 uint64_t n_; // total number of items processed by sketch
287 double total_wt_r_; // total weight of items in reservoir-like area
288
289 resize_factor rf_; // resize factor
290
291 uint32_t curr_items_alloc_; // currently allocated array size
292 bool filled_data_; // true if we've explicitly set all entries in data_
293
294 A allocator_;
295 T* data_; // stored sampled items
296 double* weights_; // weights for sampled items
297
298 // The next two fields are hidden from the user because they are part of the state of the
299 // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
300 // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
301 // more sense logically for these fields to be declared in the unioning object (whose entire
302 // purpose is storing the state of the unioning algorithm) but for reasons of programming
303 // convenience we are currently declaring them here. However, that could change in the future.
304
305 // Following int is:
306 // 1. Zero (for a varopt sketch)
307 // 2. Count of marked items in H region, if part of a unioning algo's gadget
308 uint32_t num_marks_in_h_;
309
310 // The following array is absent in a varopt sketch, and notionally present in a gadget
311 // (although it really belongs in the unioning object). If the array were to be made explicit,
312 // some additional coding would need to be done to ensure that all of the necessary data motion
313 // occurs and is properly tracked.
314 bool* marks_;
315
316 // used during deserialization to avoid memory leaks upon errors
317 class items_deleter;
318 class weights_deleter;
319 class marks_deleter;
320
321 var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, const A& allocator);
322 var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
323 uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
324 std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
325 std::unique_ptr<bool, marks_deleter> marks, const A& allocator);
326
327 friend class var_opt_union<T, A>;
328 var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
329
330 string<A> items_to_string(bool print_gap) const;
331
332 // internal-use-only update
333 template<typename O>
334 inline void update(O&& item, double weight, bool mark);
335
336 template<typename O>
337 inline void update_warmup_phase(O&& item, double weight, bool mark);
338
339 template<typename O>
340 inline void update_light(O&& item, double weight, bool mark);
341
342 template<typename O>
343 inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
344
345 template<typename O>
346 inline void update_heavy_general(O&& item, double weight, bool mark);
347
348 inline double get_tau() const;
349 inline double peek_min() const;
350 inline bool is_marked(uint32_t idx) const;
351
352 inline uint32_t pick_random_slot_in_r() const;
353 inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
354 inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
355
356 template<typename O>
357 inline void push(O&& item, double wt, bool mark);
358 inline void transition_from_warmup();
359 inline void convert_to_heap();
360 inline void restore_towards_leaves(uint32_t slot_in);
361 inline void restore_towards_root(uint32_t slot_in);
362 inline void pop_min_to_m_region();
363 void grow_candidate_set(double wt_cands, uint32_t num_cands);
364 void decrease_k_by_1();
365 void strip_marks();
366 void force_set_k(uint32_t k); // used to resolve union gadget into sketch
367 void downsample_candidate_set(double wt_cands, uint32_t num_cands);
368 inline void swap_values(uint32_t src, uint32_t dst);
369 void grow_data_arrays();
370 void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
371
372 // validation
373 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
374 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
375 static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
376 uint32_t h, uint32_t r, resize_factor rf);
377
378 // things to move to common and be shared among sketches
379 static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
380 static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
381 static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
382 static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
383 static bool is_power_of_2(uint32_t v);
384 static uint32_t to_log_2(uint32_t v);
385 static inline uint32_t next_int(uint32_t max_value);
386 static inline double next_double_exclude_zero();
387
388 class iterator;
389};
390
391template<typename T, typename A>
392class var_opt_sketch<T, A>::const_iterator {
393public:
394 using iterator_category = std::input_iterator_tag;
395 using value_type = std::pair<const T&, const double>;
396 using difference_type = void;
397 using pointer = const return_value_holder<value_type>;
398 using reference = const value_type;
399
400 const_iterator(const const_iterator& other);
401 const_iterator& operator++();
402 const_iterator& operator++(int);
403 bool operator==(const const_iterator& other) const;
404 bool operator!=(const const_iterator& other) const;
405 reference operator*() const;
406 pointer operator->() const;
407
408private:
409 friend class var_opt_sketch<T, A>;
410 friend class var_opt_union<T, A>;
411
412 // default iterator over full sketch
413 const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
414
415 // iterates over only one of the H or R regions
416 // does not apply weight correction
417 const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
418
419 bool get_mark() const;
420
421 const var_opt_sketch<T, A>* sk_;
422 double cum_r_weight_; // used for weight correction
423 double r_item_wt_;
424 size_t idx_;
425 const size_t final_idx_;
426};
427
428// non-const iterator for internal use
429template<typename T, typename A>
430class var_opt_sketch<T, A>::iterator {
431public:
432 using iterator_category = std::input_iterator_tag;
433 using value_type = std::pair<T&, double>;
434 using difference_type = void;
435 using pointer = return_value_holder<value_type>;
436 using reference = value_type;
437
438 iterator(const iterator& other);
439 iterator& operator++();
440 iterator& operator++(int);
441 bool operator==(const iterator& other) const;
442 bool operator!=(const iterator& other) const;
443 reference operator*();
444 pointer operator->();
445
446private:
447 friend class var_opt_sketch<T, A>;
448 friend class var_opt_union<T, A>;
449
450 // iterates over only one of the H or R region, applying weight correction
451 // if iterating over R region (can correct for numerical precision issues)
452 iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
453
454 bool get_mark() const;
455
456 const var_opt_sketch<T, A>* sk_;
457 double cum_r_weight_; // used for weight correction
458 double r_item_wt_;
459 size_t idx_;
460 const size_t final_idx_;
461};
462
463
464} // namespace datasketches
465
466#include "var_opt_sketch_impl.hpp"
467
468#endif // _VAR_OPT_SKETCH_HPP_
This sketch samples data from a stream of items.
Definition var_opt_sketch.hpp:67
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition var_opt_sketch_impl.hpp:735
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition var_opt_sketch_impl.hpp:1486
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition var_opt_sketch_impl.hpp:709
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
var_opt_sketch & operator=(const var_opt_sketch &other)
Copy assignment.
Definition var_opt_sketch_impl.hpp:217
bool is_empty() const
Returns true if the sketch is empty.
Definition var_opt_sketch_impl.hpp:643
subset_summary estimate_subset_sum(P predicate) const
Computes an estimated subset sum from the entire stream for objects matching a given predicate.
Definition var_opt_sketch_impl.hpp:1381
static var_opt_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
uint32_t get_num_samples() const
Returns the number of samples currently in the sketch.
Definition var_opt_sketch_impl.hpp:703
void reset()
Resets the sketch to its default, empty state.
Definition var_opt_sketch_impl.hpp:648
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition var_opt_sketch_impl.hpp:297
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition var_opt_sketch_impl.hpp:1491
static var_opt_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition var_opt_sketch_impl.hpp:719
uint32_t get_k() const
Returns the configured maximum sample size.
Definition var_opt_sketch_impl.hpp:698
uint64_t get_n() const
Returns the length of the input stream.
Definition var_opt_sketch_impl.hpp:693
Provides a unioning operation over var_opt_sketch objects.
Definition var_opt_union.hpp:52
const resize_factor DEFAULT_RESIZE_FACTOR
default resize factor
Definition var_opt_sketch.hpp:49
const uint32_t MAX_K
maximum value of parameter K
Definition var_opt_sketch.hpp:51
DataSketches namespace.
Definition binomial_bounds.hpp:38