datasketches-cpp
req_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef REQ_SKETCH_HPP_
21 #define REQ_SKETCH_HPP_
22 
23 #include <iterator>
24 
25 #include "req_common.hpp"
26 #include "req_compactor.hpp"
27 #include "quantiles_sorted_view.hpp"
28 #include "optional.hpp"
29 
30 namespace datasketches {
31 
74 template<
75  typename T,
76  typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
77  typename Allocator = std::allocator<T>
78 >
79 class req_sketch {
80 public:
81  using value_type = T;
82  using comparator = Comparator;
83  using allocator_type = Allocator;
84  using Compactor = req_compactor<T, Comparator, Allocator>;
85  using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
86  using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
87 
93 
103  explicit req_sketch(uint16_t k, bool hra = true, const Comparator& comparator = Comparator(),
104  const Allocator& allocator = Allocator());
105 
110  req_sketch(const req_sketch& other);
111 
116  req_sketch(req_sketch&& other) noexcept;
117 
118  ~req_sketch();
119 
125  req_sketch& operator=(const req_sketch& other);
126 
132  req_sketch& operator=(req_sketch&& other);
133 
140  template<typename TT, typename CC, typename AA>
141  explicit req_sketch(const req_sketch<TT, CC, AA>& other, const Comparator& comparator = Comparator(),
142  const Allocator& allocator = Allocator());
143 
148  uint16_t get_k() const;
149 
154  bool is_HRA() const;
155 
160  bool is_empty() const;
161 
166  uint64_t get_n() const;
167 
172  uint32_t get_num_retained() const;
173 
178  bool is_estimation_mode() const;
179 
184  template<typename FwdT>
185  void update(FwdT&& item);
186 
191  template<typename FwdSk>
192  void merge(FwdSk&& other);
193 
199  const T& get_min_item() const;
200 
206  const T& get_max_item() const;
207 
212  Comparator get_comparator() const;
213 
218  Allocator get_allocator() const;
219 
232  double get_rank(const T& item, bool inclusive = true) const;
233 
253  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
254 
277  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
278 
289  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
290 
297  double get_rank_lower_bound(double rank, uint8_t num_std_dev) const;
298 
305  double get_rank_upper_bound(double rank, uint8_t num_std_dev) const;
306 
318  static double get_RSE(uint16_t k, double rank, bool hra, uint64_t n);
319 
326  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
327  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
328 
335  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
336  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
337 
343  template<typename SerDe = serde<T>>
344  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
345 
346  // This is a convenience alias for users
347  // The type returned by the following serialize method
348  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
349 
358  template<typename SerDe = serde<T>>
359  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
360 
369  template<typename SerDe = serde<T>>
370  static req_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
371  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
372 
382  template<typename SerDe = serde<T>>
383  static req_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
384  const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
385 
391  string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
392 
393  class const_iterator;
394 
400  const_iterator begin() const;
401 
408  const_iterator end() const;
409 
415 
416 private:
417  Comparator comparator_;
418  Allocator allocator_;
419  uint16_t k_;
420  bool hra_;
421  uint32_t max_nom_size_;
422  uint32_t num_retained_;
423  uint64_t n_;
424  std::vector<Compactor, AllocCompactor> compactors_;
425  optional<T> min_item_;
426  optional<T> max_item_;
428 
429  void setup_sorted_view() const; // modifies mutable state
430  void reset_sorted_view();
431 
432  static const bool LAZY_COMPRESSION = false;
433 
434  static const uint8_t SERIAL_VERSION = 1;
435  static const uint8_t FAMILY = 17;
436  static const size_t PREAMBLE_SIZE_BYTES = 8;
437  enum flags { RESERVED1, RESERVED2, IS_EMPTY, IS_HIGH_RANK, RAW_ITEMS, IS_LEVEL_ZERO_SORTED };
438 
439  static constexpr double FIXED_RSE_FACTOR = 0.084;
440  static double relative_rse_factor();
441 
442  uint8_t get_num_levels() const;
443  void grow();
444  void update_max_nom_size();
445  void update_num_retained();
446  void compress();
447 
448  static double get_rank_lb(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
449  static double get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
450  static bool is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra);
451 
452  // for deserialization
453  req_sketch(uint16_t k, bool hra, uint64_t n,
454  optional<T>&& min_item, optional<T>&& max_item,
455  std::vector<Compactor, AllocCompactor>&& compactors, const Comparator& comparator);
456 
457  static void check_preamble_ints(uint8_t preamble_ints, uint8_t num_levels);
458  static void check_serial_version(uint8_t serial_version);
459  static void check_family_id(uint8_t family_id);
460 
461  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
462  static inline bool check_update_item(const TT& item) {
463  return !std::isnan(item);
464  }
465 
466  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
467  static inline bool check_update_item(const TT&) {
468  return true;
469  }
470 
471  // for type converting constructor
472  template<typename TT, typename CC, typename AA> friend class req_sketch;
473 };
474 
475 template<typename T, typename C, typename A>
476 class req_sketch<T, C, A>::const_iterator {
477 public:
478  using iterator_category = std::input_iterator_tag;
479  using value_type = std::pair<const T&, const uint64_t>;
480  using difference_type = void;
481  using pointer = const return_value_holder<value_type>;
482  using reference = const value_type;
483 
484  const_iterator& operator++();
485  const_iterator& operator++(int);
486  bool operator==(const const_iterator& other) const;
487  bool operator!=(const const_iterator& other) const;
488  reference operator*() const;
489  pointer operator->() const;
490 private:
491  using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
492  LevelsIterator levels_it_;
493  LevelsIterator levels_end_;
494  const T* compactor_it_;
495  friend class req_sketch<T, C, A>;
496  const_iterator(LevelsIterator begin, LevelsIterator end);
497 };
498 
499 } /* namespace datasketches */
500 
501 #include "req_sketch_impl.hpp"
502 
503 #endif
Sorted view for quantiles sketches (REQ, KLL and Quantiles)
Definition: quantiles_sorted_view.hpp:38
typename std::conditional< std::is_arithmetic< T >::value, T, const T & >::type quantile_return_type
Quantile return type.
Definition: quantiles_sorted_view.hpp:93
Relative Error Quantiles Sketch.
Definition: req_sketch.hpp:79
Comparator get_comparator() const
Returns an instance of the comparator for this sketch.
Definition: req_sketch_impl.hpp:224
quantiles_sorted_view< T, Comparator, Allocator > get_sorted_view() const
Gets the sorted view of this sketch.
Definition: req_sketch_impl.hpp:269
static req_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
req_sketch(const req_sketch< TT, CC, AA > &other, const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
Type converting constructor.
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
vector_double get_CDF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Cumulative Distribution Function (CDF), which is the cumulative analo...
Definition: req_sketch_impl.hpp:251
uint32_t get_num_retained() const
Returns the number of retained items in the sketch.
Definition: req_sketch_impl.hpp:159
vector_double get_PMF(const T *split_points, uint32_t size, bool inclusive=true) const
Returns an approximation to the Probability Mass Function (PMF) of the input stream given a set of sp...
Definition: req_sketch_impl.hpp:244
void merge(FwdSk &&other)
Merges another sketch into this one.
Definition: req_sketch_impl.hpp:188
typename quantiles_sorted_view< T, Comparator, Allocator >::quantile_return_type quantile_return_type
Quantile return type.
Definition: req_sketch.hpp:92
void update(FwdT &&item)
Updates this sketch with the given data item.
Definition: req_sketch_impl.hpp:170
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: req_sketch_impl.hpp:369
string< Allocator > to_string(bool print_levels=false, bool print_items=false) const
Prints a summary of the sketch.
Definition: req_sketch_impl.hpp:631
uint16_t get_k() const
Returns configured parameter K.
Definition: req_sketch_impl.hpp:139
bool is_empty() const
Returns true if this sketch is empty.
Definition: req_sketch_impl.hpp:149
const T & get_max_item() const
Returns the max item of the stream.
Definition: req_sketch_impl.hpp:218
req_sketch(uint16_t k, bool hra=true, const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
Constructor.
double get_rank_upper_bound(double rank, uint8_t num_std_dev) const
Returns an approximate upper bound of the given normalized rank.
Definition: req_sketch_impl.hpp:289
double get_rank(const T &item, bool inclusive=true) const
Returns an approximation to the normalized rank of the given item from 0 to 1 inclusive.
Definition: req_sketch_impl.hpp:234
static double get_RSE(uint16_t k, double rank, bool hra, uint64_t n)
Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,...
Definition: req_sketch_impl.hpp:294
double get_rank_lower_bound(double rank, uint8_t num_std_dev) const
Returns an approximate lower bound of the given normalized rank.
Definition: req_sketch_impl.hpp:284
Allocator get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: req_sketch_impl.hpp:229
req_sketch & operator=(const req_sketch &other)
Copy assignment.
Definition: req_sketch_impl.hpp:81
quantile_return_type get_quantile(double rank, bool inclusive=true) const
Returns an approximate quantile of the given normalized rank.
Definition: req_sketch_impl.hpp:258
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: req_sketch_impl.hpp:724
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: req_sketch_impl.hpp:334
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: req_sketch_impl.hpp:729
static req_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const Comparator &comparator=Comparator(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
bool is_estimation_mode() const
Returns true if this sketch is in estimation mode.
Definition: req_sketch_impl.hpp:164
const T & get_min_item() const
Returns the min item of the stream.
Definition: req_sketch_impl.hpp:212
bool is_HRA() const
Returns configured parameter High Rank Accuracy.
Definition: req_sketch_impl.hpp:144
uint64_t get_n() const
Returns the length of the input stream.
Definition: req_sketch_impl.hpp:154
DataSketches namespace.
Definition: binomial_bounds.hpp:38