datasketches-cpp
kll_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef KLL_SKETCH_HPP_
21 #define KLL_SKETCH_HPP_
22 
23 #include <memory>
24 #include <vector>
25 
26 #include "common_defs.hpp"
27 #include "serde.hpp"
28 #include "quantiles_sorted_view.hpp"
29 #include "optional.hpp"
30 
31 namespace datasketches {
32 
34 namespace kll_constants {
36  const uint16_t DEFAULT_K = 200;
37  const uint8_t DEFAULT_M = 8;
39  const uint16_t MIN_K = DEFAULT_M;
41  const uint16_t MAX_K = (1 << 16) - 1;
42 }
43 
161 template <
162  typename T,
163  typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
164  typename A = std::allocator<T>
165 >
166 class kll_sketch {
167  public:
168  using value_type = T;
169  using comparator = C;
170  using allocator_type = A;
171  using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
172  using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
173 
179 
186  explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
187 
192  kll_sketch(const kll_sketch& other);
193 
198  kll_sketch(kll_sketch&& other) noexcept;
199 
200 
201  ~kll_sketch();
202 
208  kll_sketch& operator=(const kll_sketch& other);
209 
215  kll_sketch& operator=(kll_sketch&& other);
216 
217  /*
218  * Type converting constructor.
219  * @param other sketch of a different type
220  * @param comparator instance of a Comparator
221  * @param allocator instance of an Allocator
222  */
223  template<typename TT, typename CC, typename AA>
224  explicit kll_sketch(const kll_sketch<TT, CC, AA>& other, const C& comparator = C(), const A& allocator = A());
225 
230  template<typename FwdT>
231  void update(FwdT&& item);
232 
237  template<typename FwdSk>
238  void merge(FwdSk&& other);
239 
244  bool is_empty() const;
245 
250  uint16_t get_k() const;
251 
256  uint64_t get_n() const;
257 
262  uint32_t get_num_retained() const;
263 
268  bool is_estimation_mode() const;
269 
275  T get_min_item() const;
276 
282  T get_max_item() const;
283 
288  C get_comparator() const;
289 
294  A get_allocator() const;
295 
307  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
308 
324  double get_rank(const T& item, bool inclusive = true) const;
325 
348  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
349 
375  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
376 
384  double get_normalized_rank_error(bool pmf) const;
385 
392  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
393  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
394 
401  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
402  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
403 
414  template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
415  static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
416 
428  template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
429  static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
430 
436  template<typename SerDe = serde<T>>
437  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
438 
439  // This is a convenience alias for users
440  // The type returned by the following serialize method
441  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
442 
452  template<typename SerDe = serde<T>>
453  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
454 
463  template<typename SerDe = serde<T>>
464  static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
465  const C& comparator = C(), const A& allocator = A());
466 
476  template<typename SerDe = serde<T>>
477  static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
478  const C& comparator = C(), const A& allocator = A());
479 
480  /*
481  * Gets the normalized rank error given k and pmf.
482  * k - the configuration parameter
483  * pmf - if true, returns the "double-sided" normalized rank error for the get_PMF() function.
484  * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
485  * Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials
486  */
487  static double get_normalized_rank_error(uint16_t k, bool pmf);
488 
494  string<A> to_string(bool print_levels = false, bool print_items = false) const;
495 
496  class const_iterator;
497 
503  const_iterator begin() const;
504 
511  const_iterator end() const;
512 
518 
519  private:
520  /* Serialized sketch layout:
521  * Addr:
522  * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
523  * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts |
524  * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
525  * 1 ||-----------------------------------N------------------------------------------|
526  * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
527  * 2 ||---------------data----------------|-unused-|numLevels|-------min K-----------|
528  */
529 
530  static const size_t EMPTY_SIZE_BYTES = 8;
531  static const size_t DATA_START_SINGLE_ITEM = 8;
532  static const size_t DATA_START = 20;
533 
534  static const uint8_t SERIAL_VERSION_1 = 1;
535  static const uint8_t SERIAL_VERSION_2 = 2;
536  static const uint8_t FAMILY = 15;
537 
538  enum flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM };
539 
540  static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
541  static const uint8_t PREAMBLE_INTS_FULL = 5;
542 
543  C comparator_;
544  A allocator_;
545  uint16_t k_;
546  uint8_t m_; // minimum buffer "width"
547  uint16_t min_k_; // for error estimation after merging with different k
548  uint8_t num_levels_;
549  bool is_level_zero_sorted_;
550  uint64_t n_;
551  vector_u32 levels_;
552  T* items_;
553  uint32_t items_size_;
554  optional<T> min_item_;
555  optional<T> max_item_;
556  mutable quantiles_sorted_view<T, C, A>* sorted_view_;
557 
558  // for deserialization
559  class items_deleter;
560  kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
561  std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
562  optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator);
563 
564  // common update code
565  inline void update_min_max(const T& item);
566  inline uint32_t internal_update();
567 
568  // The following code is only valid in the special case of exactly reaching capacity while updating.
569  // It cannot be used while merging, while reducing k, or anything else.
570  void compress_while_updating(void);
571 
572  uint8_t find_level_to_compact() const;
573  void add_empty_top_level_to_completely_full_sketch();
574  void sort_level_zero();
575 
576  template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
577 
578  template<typename FwdSk>
579  void populate_work_arrays(FwdSk&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels);
580 
581  void assert_correct_total_weight() const;
582  uint32_t safe_level_size(uint8_t level) const;
583  uint32_t get_num_retained_above_level_zero() const;
584 
585  static void check_m(uint8_t m);
586  static void check_preamble_ints(uint8_t preamble_ints, uint8_t flags_byte);
587  static void check_serial_version(uint8_t serial_version);
588  static void check_family_id(uint8_t family_id);
589 
590  void check_sorting() const;
591 
592  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
593  static inline bool check_update_item(TT item) {
594  return !std::isnan(item);
595  }
596 
597  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
598  static inline bool check_update_item(TT) {
599  return true;
600  }
601 
602  // for type converting constructor
603  template<typename TT, typename CC, typename AA> friend class kll_sketch;
604 
605  void setup_sorted_view() const; // modifies mutable state
606  void reset_sorted_view();
607 };
608 
609 template<typename T, typename C, typename A>
610 class kll_sketch<T, C, A>::const_iterator {
611 public:
612  using iterator_category = std::input_iterator_tag;
613  using value_type = std::pair<const T&, const uint64_t>;
614  using difference_type = void;
615  using pointer = const return_value_holder<value_type>;
616  using reference = const value_type;
617 
618  friend class kll_sketch<T, C, A>;
619  const_iterator& operator++();
620  const_iterator& operator++(int);
621  bool operator==(const const_iterator& other) const;
622  bool operator!=(const const_iterator& other) const;
623  reference operator*() const;
624  pointer operator->() const;
625 private:
626  const T* items;
627  const uint32_t* levels;
628  const uint8_t num_levels;
629  uint32_t index;
630  uint8_t level;
631  uint64_t weight;
632  const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels);
633 };
634 
635 } /* namespace datasketches */
636 
637 #include "kll_sketch_impl.hpp"
638 
639 #endif
Implementation of a very compact quantiles sketch with lazy compaction scheme and nearly optimal accu...
Definition: kll_sketch.hpp:166
C get_comparator() const
Returns an instance of the comparator for this sketch.
Definition: kll_sketch_impl.hpp:271
quantiles_sorted_view< T, C, A > get_sorted_view() const
Gets the sorted view of this sketch.
Definition: kll_sketch_impl.hpp:760
kll_sketch & operator=(const kll_sketch &other)
Copy assignment.
Definition: kll_sketch_impl.hpp:102
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
vector_double get_CDF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Cumulative Distribution Function (CDF), which is the cumulative analo...
Definition: kll_sketch_impl.hpp:295
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: kll_sketch_impl.hpp:956
uint32_t get_num_retained() const
Returns the number of retained items (samples) in the sketch.
Definition: kll_sketch_impl.hpp:249
static kll_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const C &comparator=C(), const A &allocator=A())
This method deserializes a sketch from a given stream.
vector_double get_PMF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Probability Mass Function (PMF) of the input stream given a set of sp...
Definition: kll_sketch_impl.hpp:288
void merge(FwdSk &&other)
Merges another sketch into this one.
Definition: kll_sketch_impl.hpp:209
T get_max_item() const
Returns the max item of the stream.
Definition: kll_sketch_impl.hpp:265
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: kll_sketch_impl.hpp:961
void update(FwdT &&item)
Updates this sketch with the given data item.
Definition: kll_sketch_impl.hpp:180
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: kll_sketch_impl.hpp:367
string< A > to_string(bool print_levels=false, bool print_items=false) const
Prints a summary of the sketch.
Definition: kll_sketch_impl.hpp:904
uint16_t get_k() const
Returns configured parameter k.
Definition: kll_sketch_impl.hpp:239
bool is_empty() const
Returns true if this sketch is empty.
Definition: kll_sketch_impl.hpp:234
double get_rank(const T &item, bool inclusive=true) const
Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
Definition: kll_sketch_impl.hpp:281
A get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: kll_sketch_impl.hpp:276
T get_min_item() const
Returns the min item of the stream.
Definition: kll_sketch_impl.hpp:259
typename quantiles_sorted_view< T, C, A >::quantile_return_type quantile_return_type
Quantile return type.
Definition: kll_sketch.hpp:178
quantile_return_type get_quantile(double rank, bool inclusive=true) const
Returns an item from the sketch that is the best approximation to an item from the original stream wi...
Definition: kll_sketch_impl.hpp:302
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: kll_sketch_impl.hpp:320
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n)
Returns upper bound on the serialized size of a sketch given a parameter k and stream length.
Definition: kll_sketch_impl.hpp:348
double get_normalized_rank_error(bool pmf) const
Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
Definition: kll_sketch_impl.hpp:313
bool is_estimation_mode() const
Returns true if this sketch is in estimation mode.
Definition: kll_sketch_impl.hpp:254
uint64_t get_n() const
Returns the length of the input stream.
Definition: kll_sketch_impl.hpp:244
static kll_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const C &comparator=C(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
Sorted view for quantiles sketches (REQ, KLL and Quantiles)
Definition: quantiles_sorted_view.hpp:38
const uint16_t MAX_K
max value of parameter K
Definition: kll_sketch.hpp:41
const uint16_t MIN_K
min value of parameter K
Definition: kll_sketch.hpp:39
const uint16_t DEFAULT_K
default value of parameter K
Definition: kll_sketch.hpp:36
DataSketches namespace.
Definition: binomial_bounds.hpp:38