datasketches-cpp
Loading...
Searching...
No Matches
ebpps_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _EBPPS_SKETCH_HPP_
21#define _EBPPS_SKETCH_HPP_
22
23#include "common_defs.hpp"
24#include "ebpps_sample.hpp"
25#include "optional.hpp"
26#include "serde.hpp"
27
28#include <random>
29#include <string>
30#include <vector>
31
32namespace datasketches {
33
35namespace ebpps_constants {
37 const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
38}
39
55template<
56 typename T,
57 typename A = std::allocator<T>
58>
60
61 public:
62 static const uint32_t MAX_K = ebpps_constants::MAX_K;
63
69 explicit ebpps_sketch(uint32_t k, const A& allocator = A());
70
77 void update(const T& item, double weight = 1.0);
78
85 void update(T&& item, double weight = 1.0);
86
92 void merge(const ebpps_sketch<T, A>& sketch);
93
99 void merge(ebpps_sketch<T, A>&& sketch);
100
101 using result_type = typename ebpps_sample<T,A>::result_type;
102
106 result_type get_result() const;
107
112 inline uint32_t get_k() const;
113
119 inline uint64_t get_n() const;
120
125 inline double get_cumulative_weight() const;
126
139 inline double get_c() const;
140
145 inline bool is_empty() const;
146
151 A get_allocator() const;
152
156 void reset();
157
163 template<typename SerDe = serde<T>>
164 inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
165
166 // This is a convenience alias for users
167 // The type returned by the following serialize method
168 using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
169
178 template<typename SerDe = serde<T>>
179 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
180
186 template<typename SerDe = serde<T>>
187 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
188
197 template<typename SerDe = serde<T>>
198 static ebpps_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
199
207 template<typename SerDe = serde<T>>
208 static ebpps_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
209
214 string<A> to_string() const;
215
224 string<A> items_to_string() const;
225
231 typename ebpps_sample<T,A>::const_iterator begin() const;
232
239 typename ebpps_sample<T,A>::const_iterator end() const;
240
241 private:
242 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
243 static const uint8_t PREAMBLE_LONGS_FULL = 5; // C is part of sample_
244 static const uint8_t SER_VER = 1;
245 static const uint8_t FAMILY_ID = 19;
246 static const uint8_t EMPTY_FLAG_MASK = 4;
247 static const uint8_t HAS_PARTIAL_ITEM_MASK = 8;
248
249 A allocator_;
250 uint32_t k_; // max size of sketch, in items
251 uint64_t n_; // total number of items processed by the sketch
252
253 double cumulative_wt_; // total weight of items processed by the sketch
254 double wt_max_; // maximum weight seen so far
255 double rho_; // latest scaling parameter for downsampling
256
257 ebpps_sample<T,A> sample_; // Object holding the current state of the sample
258
259 ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
260
261 // handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
262 // so we can send items in individually
263 template<typename O>
264 void internal_merge(O&& other);
265
266 ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt, double wt_max, double rho,
267 ebpps_sample<T,A>&& sample, const A& allocator = A());
268
269 template<typename FwdItem>
270 inline void internal_update(FwdItem&& item, double weight);
271
272 // validation
273 static uint32_t check_k(uint32_t k);
274 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
275 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
276 static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n);
277};
278
279} // namespace datasketches
280
281#include "ebpps_sketch_impl.hpp"
282
283#endif // _EBPPS_SKETCH_HPP_
An implementation of an Exact and Bounded Sampling Proportional to Size sketch.
Definition ebpps_sketch.hpp:59
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition ebpps_sketch_impl.hpp:112
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition ebpps_sketch_impl.hpp:127
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.
ebpps_sample< T, A >::const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition ebpps_sketch_impl.hpp:524
bool is_empty() const
Returns true if the sketch is empty.
Definition ebpps_sketch_impl.hpp:82
ebpps_sample< T, A >::const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition ebpps_sketch_impl.hpp:519
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition ebpps_sketch_impl.hpp:320
double get_cumulative_weight() const
Returns the cumulative weight of items processed by the sketch.
Definition ebpps_sketch_impl.hpp:77
A get_allocator() const
Returns an instance of the allocator for this sketch.
Definition ebpps_sketch_impl.hpp:122
void merge(const ebpps_sketch< T, A > &sketch)
Merges the provided sketch into the current one.
Definition ebpps_sketch_impl.hpp:199
static ebpps_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.
void reset()
Resets the sketch to its default, empty state.
Definition ebpps_sketch_impl.hpp:87
double get_c() const
Returns the expected number of samples returned upon a call to get_result() or the creation of an ite...
Definition ebpps_sketch_impl.hpp:72
static ebpps_sketch deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given stream.
string< A > to_string() const
Prints a summary of the sketch.
Definition ebpps_sketch_impl.hpp:96
result_type get_result() const
Returns a copy of the current sample, as a std::vector.
Definition ebpps_sketch_impl.hpp:163
uint32_t get_k() const
Returns the configured maximum sample size.
Definition ebpps_sketch_impl.hpp:62
uint64_t get_n() const
Returns the number of items processed by the sketch, regardless of item weight.
Definition ebpps_sketch_impl.hpp:67
const uint32_t MAX_K
maximum value of parameter K
Definition ebpps_sketch.hpp:37
DataSketches namespace.
Definition binomial_bounds.hpp:38