datasketches-cpp
ebpps_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _EBPPS_SKETCH_HPP_
21 #define _EBPPS_SKETCH_HPP_
22 
23 #include "common_defs.hpp"
24 #include "ebpps_sample.hpp"
25 #include "optional.hpp"
26 #include "serde.hpp"
27 
28 #include <random>
29 #include <string>
30 #include <vector>
31 
32 namespace datasketches {
33 
35 namespace ebpps_constants {
37  const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
38 }
39 
55 template<
56  typename T,
57  typename A = std::allocator<T>
58 >
59 class ebpps_sketch {
60 
61  public:
62  static const uint32_t MAX_K = ebpps_constants::MAX_K;
63 
69  explicit ebpps_sketch(uint32_t k, const A& allocator = A());
70 
77  void update(const T& item, double weight = 1.0);
78 
85  void update(T&& item, double weight = 1.0);
86 
92  void merge(const ebpps_sketch<T, A>& sketch);
93 
99  void merge(ebpps_sketch<T, A>&& sketch);
100 
101  using result_type = typename ebpps_sample<T,A>::result_type;
102 
106  result_type get_result() const;
107 
112  inline uint32_t get_k() const;
113 
119  inline uint64_t get_n() const;
120 
125  inline double get_cumulative_weight() const;
126 
139  inline double get_c() const;
140 
145  inline bool is_empty() const;
146 
151  A get_allocator() const;
152 
156  void reset();
157 
163  template<typename SerDe = serde<T>>
164  inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
165 
166  // This is a convenience alias for users
167  // The type returned by the following serialize method
168  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
169 
178  template<typename SerDe = serde<T>>
179  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
180 
186  template<typename SerDe = serde<T>>
187  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
188 
197  template<typename SerDe = serde<T>>
198  static ebpps_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
199 
207  template<typename SerDe = serde<T>>
208  static ebpps_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
209 
214  string<A> to_string() const;
215 
224  string<A> items_to_string() const;
225 
231  typename ebpps_sample<T,A>::const_iterator begin() const;
232 
239  typename ebpps_sample<T,A>::const_iterator end() const;
240 
241  private:
242  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
243  static const uint8_t PREAMBLE_LONGS_FULL = 5; // C is part of sample_
244  static const uint8_t SER_VER = 1;
245  static const uint8_t FAMILY_ID = 19;
246  static const uint8_t EMPTY_FLAG_MASK = 4;
247  static const uint8_t HAS_PARTIAL_ITEM_MASK = 8;
248 
249  A allocator_;
250  uint32_t k_; // max size of sketch, in items
251  uint64_t n_; // total number of items processed by the sketch
252 
253  double cumulative_wt_; // total weight of items processed by the sketch
254  double wt_max_; // maximum weight seen so far
255  double rho_; // latest scaling parameter for downsampling
256 
257  ebpps_sample<T,A> sample_; // Object holding the current state of the sample
258 
259  ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
260 
261  // handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
262  // so we can send items in individually
263  template<typename O>
264  void internal_merge(O&& other);
265 
266  ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt, double wt_max, double rho,
267  ebpps_sample<T,A>&& sample, const A& allocator = A());
268 
269  template<typename FwdItem>
270  inline void internal_update(FwdItem&& item, double weight);
271 
272  // validation
273  static uint32_t check_k(uint32_t k);
274  static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
275  static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
276  static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n);
277 };
278 
279 } // namespace datasketches
280 
281 #include "ebpps_sketch_impl.hpp"
282 
283 #endif // _EBPPS_SKETCH_HPP_
An implementation of an Exact and Bounded Sampling Proportional to Size sketch.
Definition: ebpps_sketch.hpp:59
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition: ebpps_sketch_impl.hpp:112
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition: ebpps_sketch_impl.hpp:127
ebpps_sketch(uint32_t k, const A &allocator=A())
Constructor.
Definition: ebpps_sketch_impl.hpp:36
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
ebpps_sample< T, A >::const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: ebpps_sketch_impl.hpp:524
bool is_empty() const
Returns true if the sketch is empty.
Definition: ebpps_sketch_impl.hpp:82
ebpps_sample< T, A >::const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: ebpps_sketch_impl.hpp:519
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: ebpps_sketch_impl.hpp:320
double get_cumulative_weight() const
Returns the cumulative weight of items processed by the sketch.
Definition: ebpps_sketch_impl.hpp:77
A get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: ebpps_sketch_impl.hpp:122
void merge(const ebpps_sketch< T, A > &sketch)
Merges the provided sketch into the current one.
Definition: ebpps_sketch_impl.hpp:199
static ebpps_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
void reset()
Resets the sketch to its default, empty state.
Definition: ebpps_sketch_impl.hpp:87
double get_c() const
Returns the expected number of samples returned upon a call to get_result() or the creation of an ite...
Definition: ebpps_sketch_impl.hpp:72
static ebpps_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition: ebpps_sketch_impl.hpp:96
result_type get_result() const
Returns a copy of the current sample, as a std::vector.
Definition: ebpps_sketch_impl.hpp:163
uint32_t get_k() const
Returns the configured maximum sample size.
Definition: ebpps_sketch_impl.hpp:62
uint64_t get_n() const
Returns the number of items processed by the sketch, regardless of item weight.
Definition: ebpps_sketch_impl.hpp:67
const uint32_t MAX_K
maximum value of parameter K
Definition: ebpps_sketch.hpp:37
DataSketches namespace.
Definition: binomial_bounds.hpp:38