datasketches-cpp
quantiles_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _QUANTILES_SKETCH_HPP_
21 #define _QUANTILES_SKETCH_HPP_
22 
23 #include <functional>
24 #include <memory>
25 #include <vector>
26 
27 #include "quantiles_sorted_view.hpp"
28 #include "common_defs.hpp"
29 #include "serde.hpp"
30 #include "optional.hpp"
31 
32 namespace datasketches {
33 
35 namespace quantiles_constants {
37  const uint16_t DEFAULT_K = 128;
39  const uint16_t MIN_K = 2;
41  const uint16_t MAX_K = 1 << 15;
42 }
43 
150 template <typename T,
151  typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
152  typename Allocator = std::allocator<T>>
154 public:
155  using value_type = T;
156  using allocator_type = Allocator;
157  using comparator = Comparator;
158  using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
159  using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
160 
168  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
169 
174  quantiles_sketch(const quantiles_sketch& other);
175 
179  quantiles_sketch(quantiles_sketch&& other) noexcept;
180 
181  ~quantiles_sketch();
182 
189 
195  quantiles_sketch& operator=(quantiles_sketch&& other) noexcept;
196 
203  template<typename From, typename FC, typename FA>
205  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
206 
211  template<typename FwdT>
212  void update(FwdT&& item);
213 
218  template<typename FwdSk>
219  void merge(FwdSk&& other);
220 
225  bool is_empty() const;
226 
231  uint16_t get_k() const;
232 
237  uint64_t get_n() const;
238 
243  uint32_t get_num_retained() const;
244 
249  bool is_estimation_mode() const;
250 
256  const T& get_min_item() const;
257 
263  const T& get_max_item() const;
264 
269  Comparator get_comparator() const;
270 
275  allocator_type get_allocator() const;
276 
289  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
290 
305  double get_rank(const T& item, bool inclusive = true) const;
306 
329  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
330 
356  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
357 
364  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
365  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
366 
373  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
374  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
375 
381  template<typename SerDe = serde<T>>
382  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
383 
384  // This is a convenience alias for users
385  // The type returned by the following serialize method
386  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
387 
397  template<typename SerDe = serde<T>>
398  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
399 
408  template<typename SerDe = serde<T>>
409  static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
410  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
411 
421  template<typename SerDe = serde<T>>
422  static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
423  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
424 
432  double get_normalized_rank_error(bool is_pmf) const;
433 
442  static double get_normalized_rank_error(uint16_t k, bool is_pmf);
443 
449  string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
450 
451  class const_iterator;
452 
458  const_iterator begin() const;
459 
466  const_iterator end() const;
467 
473 
474 private:
475  using Level = std::vector<T, Allocator>;
476  using VectorLevels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
477 
478  /* Serialized sketch layout:
479  * Long || Start Byte Addr:
480  * Addr:
481  * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
482  * 0 || Preamble_Longs | SerVer | FamID | Flags |----- K ---------|---- unused -----|
483  *
484  * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
485  * 1 ||---------------------------Items Seen Count (N)--------------------------------|
486  *
487  * Long 3 is the start of data, beginning with serialized min and max item, followed by
488  * the sketch data buffers.
489  */
490 
491  static const size_t EMPTY_SIZE_BYTES = 8;
492  static const uint8_t SERIAL_VERSION_1 = 1;
493  static const uint8_t SERIAL_VERSION_2 = 2;
494  static const uint8_t SERIAL_VERSION = 3;
495  static const uint8_t FAMILY = 8;
496 
497  enum flags { RESERVED0, RESERVED1, IS_EMPTY, IS_COMPACT, IS_SORTED };
498 
499  static const uint8_t PREAMBLE_LONGS_SHORT = 1; // for empty
500  static const uint8_t PREAMBLE_LONGS_FULL = 2;
501  static const size_t DATA_START = 16;
502 
503  Comparator comparator_;
504  Allocator allocator_;
505  bool is_base_buffer_sorted_;
506  uint16_t k_;
507  uint64_t n_;
508  uint64_t bit_pattern_;
509  Level base_buffer_;
510  VectorLevels levels_;
511  optional<T> min_item_;
512  optional<T> max_item_;
514 
515  void setup_sorted_view() const; // modifies mutable state
516  void reset_sorted_view();
517 
518  // for deserialization
519  class items_deleter;
520  quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
521  Level&& base_buffer, VectorLevels&& levels,
522  optional<T>&& min_item, optional<T>&& max_item,
523  bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
524 
525  void grow_base_buffer();
526  void process_full_base_buffer();
527 
528  // returns true if size adjusted, else false
529  bool grow_levels_if_needed();
530 
531  // buffers should be pre-sized to target capacity as appropriate
532  template<typename FwdV>
533  static void in_place_propagate_carry(uint8_t starting_level, FwdV&& buf_size_k,
534  Level& buf_size_2k, bool apply_as_update,
535  quantiles_sketch& sketch);
536  static void zip_buffer(Level& buf_in, Level& buf_out);
537  static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
538 
539  template<typename SerDe>
540  static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
541 
542  template<typename SerDe>
543  static std::pair<Level, size_t> deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
544 
545  static void check_k(uint16_t k);
546  static void check_serial_version(uint8_t serial_version);
547  static void check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version);
548  static void check_family_id(uint8_t family_id);
549 
550  static uint32_t compute_retained_items(uint16_t k, uint64_t n);
551  static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
552  static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
553  static uint32_t count_valid_levels(uint64_t bit_pattern);
554  static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
555 
560  template<typename FwdSk>
561  static void standard_merge(quantiles_sketch& tgt, FwdSk&& src);
562 
569  template<typename FwdSk>
570  static void downsampling_merge(quantiles_sketch& tgt, FwdSk&& src);
571 
572  template<typename FwdV>
573  static void zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride);
574 
582  static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
583 
584  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
585  static inline bool check_update_item(TT item) {
586  return !std::isnan(item);
587  }
588 
589  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
590  static inline bool check_update_item(TT) {
591  return true;
592  }
593 
594  // for type converting constructor
595  template<typename From, typename FC, typename FA> friend class quantiles_sketch;
596 };
597 
598 
599 template<typename T, typename C, typename A>
600 class quantiles_sketch<T, C, A>::const_iterator {
601 public:
602  using iterator_category = std::input_iterator_tag;
603  using value_type = std::pair<const T&, const uint64_t>;
604  using difference_type = void;
605  using pointer = const return_value_holder<value_type>;
606  using reference = const value_type;
607 
608  const_iterator& operator++();
609  const_iterator& operator++(int);
610  bool operator==(const const_iterator& other) const;
611  bool operator!=(const const_iterator& other) const;
612  reference operator*() const;
613  pointer operator->() const;
614 private:
615  friend class quantiles_sketch<T, C, A>;
616  using Level = std::vector<T, A>;
617  using AllocLevel = typename std::allocator_traits<A>::template rebind_alloc<Level>;
618  Level base_buffer_;
619  std::vector<Level, AllocLevel> levels_;
620  int level_;
621  uint32_t index_;
622  uint32_t bb_count_;
623  uint64_t bit_pattern_;
624  uint64_t weight_;
625  uint16_t k_;
626  const_iterator(const Level& base_buffer, const std::vector<Level, AllocLevel>& levels, uint16_t k, uint64_t n, bool is_end);
627 };
628 
629 } /* namespace datasketches */
630 
631 #include "quantiles_sketch_impl.hpp"
632 
633 #endif // _QUANTILES_SKETCH_HPP_
This is a stochastic streaming sketch that enables near-real time analysis of the approximate distrib...
Definition: quantiles_sketch.hpp:153
Comparator get_comparator() const
Returns an instance of the comparator for this sketch.
Definition: quantiles_sketch_impl.hpp:681
quantiles_sorted_view< T, Comparator, Allocator > get_sorted_view() const
Gets the sorted view of this sketch.
Definition: quantiles_sketch_impl.hpp:723
quantiles_sketch(uint16_t k=quantiles_constants::DEFAULT_K, const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
Constructor.
quantiles_sketch & operator=(const quantiles_sketch &other)
Copy assignment.
Definition: quantiles_sketch_impl.hpp:89
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
vector_double get_CDF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Cumulative Distribution Function (CDF), which is the cumulative analo...
Definition: quantiles_sketch_impl.hpp:769
uint32_t get_num_retained() const
Returns the number of retained items (samples) in the sketch.
Definition: quantiles_sketch_impl.hpp:664
double get_normalized_rank_error(bool is_pmf) const
Gets the normalized rank error for this sketch.
Definition: quantiles_sketch_impl.hpp:711
vector_double get_PMF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Probability Mass Function (PMF) of the input stream given a set of sp...
Definition: quantiles_sketch_impl.hpp:762
void merge(FwdSk &&other)
Merges another sketch into this one.
Definition: quantiles_sketch_impl.hpp:235
void update(FwdT &&item)
Updates this sketch with the given data item.
Definition: quantiles_sketch_impl.hpp:212
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: quantiles_sketch_impl.hpp:277
string< Allocator > to_string(bool print_levels=false, bool print_items=false) const
Prints a summary of the sketch.
Definition: quantiles_sketch_impl.hpp:595
uint16_t get_k() const
Returns configured parameter k.
Definition: quantiles_sketch_impl.hpp:644
bool is_empty() const
Returns true if this sketch is empty.
Definition: quantiles_sketch_impl.hpp:654
const T & get_max_item() const
Returns the max item of the stream.
Definition: quantiles_sketch_impl.hpp:675
static quantiles_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
double get_rank(const T &item, bool inclusive=true) const
Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
Definition: quantiles_sketch_impl.hpp:755
static quantiles_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
allocator_type get_allocator() const
Returns the allocator for this sketch.
Definition: quantiles_sketch_impl.hpp:686
quantile_return_type get_quantile(double rank, bool inclusive=true) const
Returns an approximation to the data item associated with the given rank of a hypothetical sorted ver...
Definition: quantiles_sketch_impl.hpp:744
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: quantiles_sketch_impl.hpp:866
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: quantiles_sketch_impl.hpp:693
bool is_estimation_mode() const
Returns true if this sketch is in estimation mode.
Definition: quantiles_sketch_impl.hpp:659
quantiles_sketch(const quantiles_sketch< From, FC, FA > &other, const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
Type converting constructor.
const T & get_min_item() const
Returns the min item of the stream.
Definition: quantiles_sketch_impl.hpp:669
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: quantiles_sketch_impl.hpp:871
uint64_t get_n() const
Returns the length of the input stream.
Definition: quantiles_sketch_impl.hpp:649
Sorted view for quantiles sketches (REQ, KLL and Quantiles)
Definition: quantiles_sorted_view.hpp:38
typename std::conditional< std::is_arithmetic< T >::value, T, const T & >::type quantile_return_type
Quantile return type.
Definition: quantiles_sorted_view.hpp:93
const uint16_t MAX_K
maximum value of parameter K
Definition: quantiles_sketch.hpp:41
const uint16_t MIN_K
minimum value of parameter K
Definition: quantiles_sketch.hpp:39
const uint16_t DEFAULT_K
default value of parameter K
Definition: quantiles_sketch.hpp:37
DataSketches namespace.
Definition: binomial_bounds.hpp:38
Interface for serializing and deserializing items.
Definition: serde.hpp:34