datasketches-cpp
Loading...
Searching...
No Matches
var_opt_union.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _VAR_OPT_UNION_HPP_
21#define _VAR_OPT_UNION_HPP_
22
23#include "var_opt_sketch.hpp"
24#include "common_defs.hpp"
25#include "serde.hpp"
26
27#include <vector>
28
29namespace datasketches {
30
31template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32
48template<
49 typename T,
50 typename A = std::allocator<T>
51>
53
54public:
55
56 explicit var_opt_union(uint32_t max_k, const A& allocator = A());
57 var_opt_union(const var_opt_union& other);
58 var_opt_union(var_opt_union&& other) noexcept;
59
61
62 var_opt_union& operator=(const var_opt_union& other);
63 var_opt_union& operator=(var_opt_union&& other);
64
72 void update(const var_opt_sketch<T, A>& sk);
73
82
88
92 void reset();
93
100 template<typename SerDe = serde<T>>
101 size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
102
103 // This is a convenience alias for users
104 // The type returned by the following serialize method
105 typedef vector_u8<A> vector_bytes;
106
116 template<typename SerDe = serde<T>>
117 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
118
125 template<typename SerDe = serde<T>>
126 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
127
136 template<typename SerDe = serde<T>>
137 static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
138
148 template<typename SerDe = serde<T>>
149 static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
150
155 string<A> to_string() const;
156
157private:
158 using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
159 using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
160 using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
161
162 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
163 static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
164 static const uint8_t SER_VER = 2;
165 static const uint8_t FAMILY_ID = 14;
166 static const uint8_t EMPTY_FLAG_MASK = 4;
167
168 uint64_t n_; // cumulative over all input sketches
169
170 // outer tau is the largest tau of any input sketch
171 double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
172
173 // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
174 uint64_t outer_tau_denom_;
175
176 uint32_t max_k_;
177
178 A allocator_;
179
180 var_opt_sketch<T, A> gadget_;
181
182 var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
183 uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
184
185 /*
186 IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
187 but in fact is NOT because it doesn't satisfy the mathematical definition
188 of a varopt sketch of the concatenated input streams. Therefore it could be different
189 from a true varopt sketch with that value of K, in which case it could easily provide
190 worse estimation accuracy for subset-sum queries.
191
192 This should not surprise you; the approximation guarantees of varopt sketches
193 do not apply to things that merely resemble varopt sketches.
194
195 However, even though the gadget is not a varopt sketch, the result
196 of the unioning process IS a varopt sketch. It is constructed by a
197 somewhat complicated "resolution" process which determines the largest K
198 that a valid varopt sketch could have given the available information,
199 then constructs a varopt sketch of that size and returns it.
200
201 However, the gadget itself is not touched during the resolution process,
202 and additional sketches could subsequently be merged into the union,
203 at which point a varopt result could again be requested.
204 */
205
206 /*
207 Explanation of "marked items" in the union's gadget:
208
209 The boolean value "true" in an pair indicates that the item
210 came from an input sketch's R zone, so it is already the result of sampling.
211
212 Therefore it must not wind up in the H zone of the final result, because
213 that would imply that the item is "exact".
214
215 However, it is okay for a marked item to hang out in the gadget's H zone for a while.
216
217 And once the item has moved to the gadget's R zone, the mark is never checked again,
218 so no effort is made to ensure that its value is preserved or even makes sense.
219 */
220
221 /*
222 Note: if the computer could perform exact real-valued arithmetic, the union could finalize
223 its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
224 arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
225 case.]
226
227 In fact, we won't even look at tau while while reducing k. Instead the logic will be based
228 on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
229 to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
230 be fairly harmless since we will have achieved our goal of getting the marked items out of H.
231
232 Also, you might be wondering why we are bothering to maintain the numerator and denominator
233 separately instead of just having a single variable outer_tau. This allows us (in certain
234 cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
235 it then adding it back up. That would be a source of numerical inaccuracy. And even
236 more importantly, this design choice allows us to exactly re-construct the input sketch
237 when there is only one of them.
238 */
239 inline void merge_items(const var_opt_sketch<T, A>& sk);
240 inline void merge_items(var_opt_sketch<T, A>&& sk);
241 inline void resolve_tau(const var_opt_sketch<T, A>& sketch);
242
243 double get_outer_tau() const;
244
245 var_opt_sketch<T, A> simple_gadget_coercer() const;
246
247 bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
248 bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T, A>& sk) const;
249 void mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) const;
250 void migrate_marked_items_by_decreasing_k(var_opt_sketch<T, A>& sk) const;
251
252 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
253 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
254};
255
256}
257
258#include "var_opt_union_impl.hpp"
259
260#endif // _VAR_OPT_UNION_HPP_
This sketch samples data from a stream of items.
Definition var_opt_sketch.hpp:72
Provides a unioning operation over var_opt_sketch objects.
Definition var_opt_union.hpp:52
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
NOTE: This method may be deprecated in a future version.
var_opt_sketch< T, A > get_result() const
Gets the varopt sketch resulting from the union of any input sketches.
Definition var_opt_union_impl.hpp:425
void update(const var_opt_sketch< T, A > &sk)
Updates this union with the given sketch This method takes an lvalue.
Definition var_opt_union_impl.hpp:323
static var_opt_union deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the union.
Definition var_opt_union_impl.hpp:218
void reset()
Resets the union to its default, empty state.
Definition var_opt_union_impl.hpp:301
string< A > to_string() const
Prints a summary of the union as a string.
Definition var_opt_union_impl.hpp:309
static var_opt_union deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
DataSketches namespace.
Definition binomial_bounds.hpp:38