datasketches-cpp
Loading...
Searching...
No Matches
var_opt_union.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _VAR_OPT_UNION_HPP_
21#define _VAR_OPT_UNION_HPP_
22
23#include "var_opt_sketch.hpp"
24#include "common_defs.hpp"
25#include "serde.hpp"
26
27#include <vector>
28
29namespace datasketches {
30
31template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32
48template<
49 typename T,
50 typename A = std::allocator<T>
51>
53
54public:
55
56 explicit var_opt_union(uint32_t max_k, const A& allocator = A());
57 var_opt_union(const var_opt_union& other);
58 var_opt_union(var_opt_union&& other) noexcept;
59
61
62 var_opt_union& operator=(const var_opt_union& other);
63 var_opt_union& operator=(var_opt_union&& other);
64
70 void update(const var_opt_sketch<T, A>& sk);
71
78
84
88 void reset();
89
96 template<typename SerDe = serde<T>>
97 size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
98
99 // This is a convenience alias for users
100 // The type returned by the following serialize method
101 typedef vector_u8<A> vector_bytes;
102
112 template<typename SerDe = serde<T>>
113 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
114
121 template<typename SerDe = serde<T>>
122 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
123
132 template<typename SerDe = serde<T>>
133 static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
134
144 template<typename SerDe = serde<T>>
145 static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
146
151 string<A> to_string() const;
152
153private:
154 using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
155 using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
156 using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
157
158 static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
159 static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
160 static const uint8_t SER_VER = 2;
161 static const uint8_t FAMILY_ID = 14;
162 static const uint8_t EMPTY_FLAG_MASK = 4;
163
164 uint64_t n_; // cumulative over all input sketches
165
166 // outer tau is the largest tau of any input sketch
167 double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
168
169 // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
170 uint64_t outer_tau_denom_;
171
172 uint32_t max_k_;
173
174 A allocator_;
175
176 var_opt_sketch<T, A> gadget_;
177
178 var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
179 uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
180
181 /*
182 IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
183 but in fact is NOT because it doesn't satisfy the mathematical definition
184 of a varopt sketch of the concatenated input streams. Therefore it could be different
185 from a true varopt sketch with that value of K, in which case it could easily provide
186 worse estimation accuracy for subset-sum queries.
187
188 This should not surprise you; the approximation guarantees of varopt sketches
189 do not apply to things that merely resemble varopt sketches.
190
191 However, even though the gadget is not a varopt sketch, the result
192 of the unioning process IS a varopt sketch. It is constructed by a
193 somewhat complicated "resolution" process which determines the largest K
194 that a valid varopt sketch could have given the available information,
195 then constructs a varopt sketch of that size and returns it.
196
197 However, the gadget itself is not touched during the resolution process,
198 and additional sketches could subsequently be merged into the union,
199 at which point a varopt result could again be requested.
200 */
201
202 /*
203 Explanation of "marked items" in the union's gadget:
204
205 The boolean value "true" in an pair indicates that the item
206 came from an input sketch's R zone, so it is already the result of sampling.
207
208 Therefore it must not wind up in the H zone of the final result, because
209 that would imply that the item is "exact".
210
211 However, it is okay for a marked item to hang out in the gadget's H zone for a while.
212
213 And once the item has moved to the gadget's R zone, the mark is never checked again,
214 so no effort is made to ensure that its value is preserved or even makes sense.
215 */
216
217 /*
218 Note: if the computer could perform exact real-valued arithmetic, the union could finalize
219 its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
220 arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
221 case.]
222
223 In fact, we won't even look at tau while while reducing k. Instead the logic will be based
224 on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
225 to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
226 be fairly harmless since we will have achieved our goal of getting the marked items out of H.
227
228 Also, you might be wondering why we are bothering to maintain the numerator and denominator
229 separately instead of just having a single variable outer_tau. This allows us (in certain
230 cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
231 it then adding it back up. That would be a source of numerical inaccuracy. And even
232 more importantly, this design choice allows us to exactly re-construct the input sketch
233 when there is only one of them.
234 */
235 inline void merge_items(const var_opt_sketch<T, A>& sk);
236 inline void merge_items(var_opt_sketch<T, A>&& sk);
237 inline void resolve_tau(const var_opt_sketch<T, A>& sketch);
238
239 double get_outer_tau() const;
240
241 var_opt_sketch<T, A> simple_gadget_coercer() const;
242
243 bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
244 bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T, A>& sk) const;
245 void mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) const;
246 void migrate_marked_items_by_decreasing_k(var_opt_sketch<T, A>& sk) const;
247
248 static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
249 static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
250};
251
252}
253
254#include "var_opt_union_impl.hpp"
255
256#endif // _VAR_OPT_UNION_HPP_
This sketch samples data from a stream of items.
Definition var_opt_sketch.hpp:67
Provides a unioning operation over var_opt_sketch objects.
Definition var_opt_union.hpp:52
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
NOTE: This method may be deprecated in a future version.
var_opt_sketch< T, A > get_result() const
Gets the varopt sketch resulting from the union of any input sketches.
Definition var_opt_union_impl.hpp:425
void update(const var_opt_sketch< T, A > &sk)
Updates this union with the given sketch This method takes an lvalue.
Definition var_opt_union_impl.hpp:323
static var_opt_union deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the union.
Definition var_opt_union_impl.hpp:218
void reset()
Resets the union to its default, empty state.
Definition var_opt_union_impl.hpp:301
string< A > to_string() const
Prints a summary of the union as a string.
Definition var_opt_union_impl.hpp:309
static var_opt_union deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
DataSketches namespace.
Definition binomial_bounds.hpp:38