datasketches-cpp
var_opt_union.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _VAR_OPT_UNION_HPP_
21 #define _VAR_OPT_UNION_HPP_
22 
23 #include "var_opt_sketch.hpp"
24 #include "common_defs.hpp"
25 #include "serde.hpp"
26 
27 #include <vector>
28 
29 namespace datasketches {
30 
31 template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
32 
48 template<
49  typename T,
50  typename A = std::allocator<T>
51 >
53 
54 public:
55 
56  explicit var_opt_union(uint32_t max_k, const A& allocator = A());
57  var_opt_union(const var_opt_union& other);
58  var_opt_union(var_opt_union&& other) noexcept;
59 
60  ~var_opt_union();
61 
62  var_opt_union& operator=(const var_opt_union& other);
63  var_opt_union& operator=(var_opt_union&& other);
64 
70  void update(const var_opt_sketch<T, A>& sk);
71 
77  void update(var_opt_sketch<T, A>&& sk);
78 
84 
88  void reset();
89 
96  template<typename SerDe = serde<T>>
97  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
98 
99  // This is a convenience alias for users
100  // The type returned by the following serialize method
101  typedef vector_u8<A> vector_bytes;
102 
112  template<typename SerDe = serde<T>>
113  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
114 
121  template<typename SerDe = serde<T>>
122  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
123 
132  template<typename SerDe = serde<T>>
133  static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
134 
144  template<typename SerDe = serde<T>>
145  static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
146 
151  string<A> to_string() const;
152 
153 private:
154  using AllocSketch = typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T, A>>;
155  using AllocDouble = typename std::allocator_traits<A>::template rebind_alloc<double>;
156  using AllocBool = typename std::allocator_traits<A>::template rebind_alloc<bool>;
157 
158  static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
159  static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
160  static const uint8_t SER_VER = 2;
161  static const uint8_t FAMILY_ID = 14;
162  static const uint8_t EMPTY_FLAG_MASK = 4;
163 
164  uint64_t n_; // cumulative over all input sketches
165 
166  // outer tau is the largest tau of any input sketch
167  double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
168 
169  // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
170  uint64_t outer_tau_denom_;
171 
172  uint32_t max_k_;
173 
174  A allocator_;
175 
176  var_opt_sketch<T, A> gadget_;
177 
178  var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
179  uint32_t max_k, var_opt_sketch<T, A>&& gadget, const A& allocator = A());
180 
181  /*
182  IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
183  but in fact is NOT because it doesn't satisfy the mathematical definition
184  of a varopt sketch of the concatenated input streams. Therefore it could be different
185  from a true varopt sketch with that value of K, in which case it could easily provide
186  worse estimation accuracy for subset-sum queries.
187 
188  This should not surprise you; the approximation guarantees of varopt sketches
189  do not apply to things that merely resemble varopt sketches.
190 
191  However, even though the gadget is not a varopt sketch, the result
192  of the unioning process IS a varopt sketch. It is constructed by a
193  somewhat complicated "resolution" process which determines the largest K
194  that a valid varopt sketch could have given the available information,
195  then constructs a varopt sketch of that size and returns it.
196 
197  However, the gadget itself is not touched during the resolution process,
198  and additional sketches could subsequently be merged into the union,
199  at which point a varopt result could again be requested.
200  */
201 
202  /*
203  Explanation of "marked items" in the union's gadget:
204 
205  The boolean value "true" in an pair indicates that the item
206  came from an input sketch's R zone, so it is already the result of sampling.
207 
208  Therefore it must not wind up in the H zone of the final result, because
209  that would imply that the item is "exact".
210 
211  However, it is okay for a marked item to hang out in the gadget's H zone for a while.
212 
213  And once the item has moved to the gadget's R zone, the mark is never checked again,
214  so no effort is made to ensure that its value is preserved or even makes sense.
215  */
216 
217  /*
218  Note: if the computer could perform exact real-valued arithmetic, the union could finalize
219  its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
220  arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
221  case.]
222 
223  In fact, we won't even look at tau while while reducing k. Instead the logic will be based
224  on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
225  to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
226  be fairly harmless since we will have achieved our goal of getting the marked items out of H.
227 
228  Also, you might be wondering why we are bothering to maintain the numerator and denominator
229  separately instead of just having a single variable outer_tau. This allows us (in certain
230  cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
231  it then adding it back up. That would be a source of numerical inaccuracy. And even
232  more importantly, this design choice allows us to exactly re-construct the input sketch
233  when there is only one of them.
234  */
235  inline void merge_items(const var_opt_sketch<T, A>& sk);
236  inline void merge_items(var_opt_sketch<T, A>&& sk);
237  inline void resolve_tau(const var_opt_sketch<T, A>& sketch);
238 
239  double get_outer_tau() const;
240 
241  var_opt_sketch<T, A> simple_gadget_coercer() const;
242 
243  bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
244  bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T, A>& sk) const;
245  void mark_moving_gadget_coercer(var_opt_sketch<T, A>& sk) const;
246  void migrate_marked_items_by_decreasing_k(var_opt_sketch<T, A>& sk) const;
247 
248  static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
249  static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
250 };
251 
252 }
253 
254 #include "var_opt_union_impl.hpp"
255 
256 #endif // _VAR_OPT_UNION_HPP_
Provides a unioning operation over var_opt_sketch objects.
Definition: var_opt_union.hpp:52
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
NOTE: This method may be deprecated in a future version.
var_opt_sketch< T, A > get_result() const
Gets the varopt sketch resulting from the union of any input sketches.
Definition: var_opt_union_impl.hpp:425
void update(const var_opt_sketch< T, A > &sk)
Updates this union with the given sketch This method takes an lvalue.
Definition: var_opt_union_impl.hpp:323
static var_opt_union deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the union.
Definition: var_opt_union_impl.hpp:218
void reset()
Resets the union to its default, empty state.
Definition: var_opt_union_impl.hpp:301
string< A > to_string() const
Prints a summary of the union as a string.
Definition: var_opt_union_impl.hpp:309
static var_opt_union deserialize(std::istream &is, const SerDe &sd=SerDe(), const A &allocator=A())
NOTE: This method may be deprecated in a future version.
DataSketches namespace.
Definition: binomial_bounds.hpp:38