datasketches-cpp
Loading...
Searching...
No Matches
var_opt_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _VAR_OPT_SKETCH_HPP_
21#define _VAR_OPT_SKETCH_HPP_
22
23#include "serde.hpp"
24#include "common_defs.hpp"
25
26#include <iterator>
27#include <vector>
28
29namespace datasketches {
30
31template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
33
34/*
35 * A struct to hold the result of subset sum queries
36 */
37struct subset_summary {
38 double lower_bound;
39 double estimate;
40 double upper_bound;
41 double total_sketch_weight;
42};
43
44template <typename T, typename A> class var_opt_union; // forward declaration
45
47namespace var_opt_constants {
49 const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
51 const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
52}
53
68template<
69 typename T,
70 typename A = std::allocator<T>
71>
73
74 public:
75 static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
76 static const uint32_t MAX_K = var_opt_constants::MAX_K;
77
84 explicit var_opt_sketch(uint32_t k,
86 const A& allocator = A());
87
92 var_opt_sketch(const var_opt_sketch& other);
93
98 var_opt_sketch(var_opt_sketch&& other) noexcept;
99
101
108
115
124 void update(const T& item, double weight = 1.0);
125
134 void update(T&& item, double weight = 1.0);
135
140 inline uint32_t get_k() const;
141
146 inline uint64_t get_n() const;
147
152 inline uint32_t get_num_samples() const;
153
162 template<typename P>
163 subset_summary estimate_subset_sum(P predicate) const;
164
169 inline bool is_empty() const;
170
174 void reset();
175
182 template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
183 inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
184
191 template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
192 inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
193
194 // This is a convenience alias for users
195 // The type returned by the following serialize method
196 using vector_bytes = vector_u8<A>;
197
206 template<typename SerDe = serde<T>>
207 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
208
214 template<typename SerDe = serde<T>>
215 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
216
224 template<typename SerDe = serde<T>>
225 static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
226
235 template<typename SerDe = serde<T>>
236 static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
237
242 string<A> to_string() const;
243
252 string<A> items_to_string() const;
253
254 class const_iterator;
255
261 const_iterator begin() const;
262
269 const_iterator end() const;
270
271 private:
272 typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
273 typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
274
275 static const uint32_t MIN_LG_ARR_ITEMS = 4;
276
277 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
278 static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
279 static const uint8_t PREAMBLE_LONGS_FULL = 4;
280 static const uint8_t SER_VER = 2;
281 static const uint8_t FAMILY_ID = 13;
282 static const uint8_t EMPTY_FLAG_MASK = 4;
283 static const uint8_t GADGET_FLAG_MASK = 128;
284
285 // Number of standard deviations to use for subset sum error bounds
286 constexpr static const double DEFAULT_KAPPA = 2.0;
287
288 // TODO: should probably rearrange a bit to minimize gaps once aligned
289 uint32_t k_; // max size of sketch, in items
290
291 uint32_t h_; // number of items in heap
292 uint32_t m_; // number of items in middle region
293 uint32_t r_; // number of items in reservoir-like region
294
295 uint64_t n_; // total number of items processed by sketch
296 double total_wt_r_; // total weight of items in reservoir-like area
297
298 resize_factor rf_; // resize factor
299
300 uint32_t curr_items_alloc_; // currently allocated array size
301 bool filled_data_; // true if we've explicitly set all entries in data_
302
303 A allocator_;
304 T* data_; // stored sampled items
305 double* weights_; // weights for sampled items
306
307 // The next two fields are hidden from the user because they are part of the state of the
308 // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
309 // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
310 // more sense logically for these fields to be declared in the unioning object (whose entire
311 // purpose is storing the state of the unioning algorithm) but for reasons of programming
312 // convenience we are currently declaring them here. However, that could change in the future.
313
314 // Following int is:
315 // 1. Zero (for a varopt sketch)
316 // 2. Count of marked items in H region, if part of a unioning algo's gadget
317 uint32_t num_marks_in_h_;
318
319 // The following array is absent in a varopt sketch, and notionally present in a gadget
320 // (although it really belongs in the unioning object). If the array were to be made explicit,
321 // some additional coding would need to be done to ensure that all of the necessary data motion
322 // occurs and is properly tracked.
323 bool* marks_;
324
325 // used during deserialization to avoid memory leaks upon errors
326 class items_deleter;
327 class weights_deleter;
328 class marks_deleter;
329
330 var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, const A& allocator);
331 var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
332 uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
333 std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
334 std::unique_ptr<bool, marks_deleter> marks, const A& allocator);
335
336 friend class var_opt_union<T, A>;
337 var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
338
339 string<A> items_to_string(bool print_gap) const;
340
341 // internal-use-only update
342 template<typename O>
343 inline void update(O&& item, double weight, bool mark);
344
345 template<typename O>
346 inline void update_warmup_phase(O&& item, double weight, bool mark);
347
348 template<typename O>
349 inline void update_light(O&& item, double weight, bool mark);
350
351 template<typename O>
352 inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
353
354 template<typename O>
355 inline void update_heavy_general(O&& item, double weight, bool mark);
356
357 inline double get_tau() const;
358 inline double peek_min() const;
359 inline bool is_marked(uint32_t idx) const;
360
361 inline uint32_t pick_random_slot_in_r() const;
362 inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
363 inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
364
365 template<typename O>
366 inline void push(O&& item, double wt, bool mark);
367 inline void transition_from_warmup();
368 inline void convert_to_heap();
369 inline void restore_towards_leaves(uint32_t slot_in);
370 inline void restore_towards_root(uint32_t slot_in);
371 inline void pop_min_to_m_region();
372 void grow_candidate_set(double wt_cands, uint32_t num_cands);
373 void decrease_k_by_1();
374 void strip_marks();
375 void force_set_k(uint32_t k); // used to resolve union gadget into sketch
376 void downsample_candidate_set(double wt_cands, uint32_t num_cands);
377 inline void swap_values(uint32_t src, uint32_t dst);
378 void grow_data_arrays();
379 void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
380
381 // validation
382 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
383 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
384 static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
385 uint32_t h, uint32_t r, resize_factor rf);
386
387 // things to move to common and be shared among sketches
388 static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
389 static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
390 static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
391 static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
392 static bool is_power_of_2(uint32_t v);
393 static uint32_t to_log_2(uint32_t v);
394 static inline uint32_t next_int(uint32_t max_value);
395 static inline double next_double_exclude_zero();
396
397 class iterator;
398};
399
400template<typename T, typename A>
401class var_opt_sketch<T, A>::const_iterator {
402public:
403 using iterator_category = std::input_iterator_tag;
404 using value_type = std::pair<const T&, const double>;
405 using difference_type = void;
406 using pointer = const return_value_holder<value_type>;
407 using reference = const value_type;
408
409 const_iterator(const const_iterator& other);
410 const_iterator& operator++();
411 const_iterator& operator++(int);
412 bool operator==(const const_iterator& other) const;
413 bool operator!=(const const_iterator& other) const;
414 reference operator*() const;
415 pointer operator->() const;
416
417private:
418 friend class var_opt_sketch<T, A>;
419 friend class var_opt_union<T, A>;
420
421 // default iterator over full sketch
422 const_iterator(const var_opt_sketch<T, A>& sk, bool is_end);
423
424 // iterates over only one of the H or R regions
425 // does not apply weight correction
426 const_iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
427
428 bool get_mark() const;
429
430 const var_opt_sketch<T, A>* sk_;
431 double cum_r_weight_; // used for weight correction
432 double r_item_wt_;
433 size_t idx_;
434 const size_t final_idx_;
435};
436
437// non-const iterator for internal use
438template<typename T, typename A>
439class var_opt_sketch<T, A>::iterator {
440public:
441 using iterator_category = std::input_iterator_tag;
442 using value_type = std::pair<T&, double>;
443 using difference_type = void;
444 using pointer = return_value_holder<value_type>;
445 using reference = value_type;
446
447 iterator(const iterator& other);
448 iterator& operator++();
449 iterator& operator++(int);
450 bool operator==(const iterator& other) const;
451 bool operator!=(const iterator& other) const;
452 reference operator*();
453 pointer operator->();
454
455private:
456 friend class var_opt_sketch<T, A>;
457 friend class var_opt_union<T, A>;
458
459 // iterates over only one of the H or R region, applying weight correction
460 // if iterating over R region (can correct for numerical precision issues)
461 iterator(const var_opt_sketch<T, A>& sk, bool is_end, bool use_r_region);
462
463 bool get_mark() const;
464
465 const var_opt_sketch<T, A>* sk_;
466 double cum_r_weight_; // used for weight correction
467 double r_item_wt_;
468 size_t idx_;
469 const size_t final_idx_;
470};
471
472
473} // namespace datasketches
474
475#include "var_opt_sketch_impl.hpp"
476
477#endif // _VAR_OPT_SKETCH_HPP_
This sketch samples data from a stream of items.
Definition var_opt_sketch.hpp:72
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition var_opt_sketch_impl.hpp:735
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition var_opt_sketch_impl.hpp:1485
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition var_opt_sketch_impl.hpp:709
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
var_opt_sketch & operator=(const var_opt_sketch &other)
Copy assignment.
Definition var_opt_sketch_impl.hpp:217
bool is_empty() const
Returns true if the sketch is empty.
Definition var_opt_sketch_impl.hpp:643
subset_summary estimate_subset_sum(P predicate) const
Computes an estimated subset sum from the entire stream for objects matching a given predicate.
Definition var_opt_sketch_impl.hpp:1380
static var_opt_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
uint32_t get_num_samples() const
Returns the number of samples currently in the sketch.
Definition var_opt_sketch_impl.hpp:703
void reset()
Resets the sketch to its default, empty state.
Definition var_opt_sketch_impl.hpp:648
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition var_opt_sketch_impl.hpp:297
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition var_opt_sketch_impl.hpp:1490
static var_opt_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition var_opt_sketch_impl.hpp:719
uint32_t get_k() const
Returns the configured maximum sample size.
Definition var_opt_sketch_impl.hpp:698
uint64_t get_n() const
Returns the length of the input stream.
Definition var_opt_sketch_impl.hpp:693
Provides a unioning operation over var_opt_sketch objects.
Definition var_opt_union.hpp:52
const resize_factor DEFAULT_RESIZE_FACTOR
default resize factor
Definition var_opt_sketch.hpp:49
const uint32_t MAX_K
maximum value of parameter K
Definition var_opt_sketch.hpp:51
DataSketches namespace.
Definition binomial_bounds.hpp:38