datasketches-cpp
tuple_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef TUPLE_SKETCH_HPP_
21 #define TUPLE_SKETCH_HPP_
22 
23 #include <string>
24 
25 #include "serde.hpp"
26 #include "theta_update_sketch_base.hpp"
27 
28 namespace datasketches {
29 
30 // forward declarations
31 template<typename S, typename A> class tuple_sketch;
32 template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
33 template<typename S, typename A> class compact_tuple_sketch;
34 template<typename A> class theta_sketch_alloc;
35 
36 template<typename K, typename V>
37 struct pair_extract_key {
38  K& operator()(std::pair<K, V>& entry) const {
39  return entry.first;
40  }
41  const K& operator()(const std::pair<K, V>& entry) const {
42  return entry.first;
43  }
44 };
45 
50 template<
51  typename Summary,
52  typename Allocator = std::allocator<Summary>
53 >
54 class tuple_sketch {
55 public:
56  using Entry = std::pair<uint64_t, Summary>;
57  using ExtractKey = pair_extract_key<uint64_t, Summary>;
58  using iterator = theta_iterator<Entry, ExtractKey>;
59  using const_iterator = theta_const_iterator<Entry, ExtractKey>;
60 
61  virtual ~tuple_sketch() = default;
62 
66  virtual Allocator get_allocator() const = 0;
67 
71  virtual bool is_empty() const = 0;
72 
76  double get_estimate() const;
77 
87  double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
88 
96  double get_lower_bound(uint8_t num_std_devs) const;
97 
98 
108  double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
109 
110 
118  double get_upper_bound(uint8_t num_std_devs) const;
119 
123  bool is_estimation_mode() const;
124 
128  double get_theta() const;
129 
133  virtual uint64_t get_theta64() const = 0;
134 
138  virtual uint32_t get_num_retained() const = 0;
139 
143  virtual uint16_t get_seed_hash() const = 0;
144 
148  virtual bool is_ordered() const = 0;
149 
155  string<Allocator> to_string(bool print_items = false) const;
156 
161  virtual iterator begin() = 0;
162 
168  virtual iterator end() = 0;
169 
174  virtual const_iterator begin() const = 0;
175 
181  virtual const_iterator end() const = 0;
182 
183 protected:
184  virtual void print_specifics(std::ostringstream& os) const = 0;
185 
186  static uint16_t get_seed_hash(uint64_t seed);
187 
188  static void check_sketch_type(uint8_t actual, uint8_t expected);
189  static void check_serial_version(uint8_t actual, uint8_t expected);
190  static void check_seed_hash(uint16_t actual, uint16_t expected);
191 };
192 
193 // update sketch
194 
195 // for types with defined default constructor and + operation
196 template<typename Summary, typename Update>
197 struct default_tuple_update_policy {
198  Summary create() const {
199  return Summary();
200  }
201  void update(Summary& summary, const Update& update) const {
202  summary += update;
203  }
204 };
205 
211 template<
212  typename Summary,
213  typename Update = Summary,
214  typename Policy = default_tuple_update_policy<Summary, Update>,
215  typename Allocator = std::allocator<Summary>
216 >
217 class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
218 public:
220  using Entry = typename Base::Entry;
221  using ExtractKey = typename Base::ExtractKey;
222  using iterator = typename Base::iterator;
223  using const_iterator = typename Base::const_iterator;
224  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
225  using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
226  using resize_factor = typename tuple_map::resize_factor;
227 
228  // No constructor here. Use builder instead.
229  class builder;
230 
231  update_tuple_sketch(const update_tuple_sketch&) = default;
232  update_tuple_sketch(update_tuple_sketch&&) noexcept = default;
233  virtual ~update_tuple_sketch() = default;
234  update_tuple_sketch& operator=(const update_tuple_sketch&) = default;
235  update_tuple_sketch& operator=(update_tuple_sketch&&) = default;
236 
237  virtual Allocator get_allocator() const;
238  virtual bool is_empty() const;
239  virtual bool is_ordered() const;
240  virtual uint64_t get_theta64() const;
241  virtual uint32_t get_num_retained() const;
242  virtual uint16_t get_seed_hash() const;
243 
247  uint8_t get_lg_k() const;
248 
252  resize_factor get_rf() const;
253 
259  template<typename FwdUpdate>
260  inline void update(const std::string& key, FwdUpdate&& value);
261 
267  template<typename FwdUpdate>
268  inline void update(uint64_t key, FwdUpdate&& value);
269 
275  template<typename FwdUpdate>
276  inline void update(int64_t key, FwdUpdate&& value);
277 
284  template<typename FwdUpdate>
285  inline void update(uint32_t key, FwdUpdate&& value);
286 
293  template<typename FwdUpdate>
294  inline void update(int32_t key, FwdUpdate&& value);
295 
302  template<typename FwdUpdate>
303  inline void update(uint16_t key, FwdUpdate&& value);
304 
311  template<typename FwdUpdate>
312  inline void update(int16_t key, FwdUpdate&& value);
313 
320  template<typename FwdUpdate>
321  inline void update(uint8_t key, FwdUpdate&& value);
322 
329  template<typename FwdUpdate>
330  inline void update(int8_t key, FwdUpdate&& value);
331 
338  template<typename FwdUpdate>
339  inline void update(double key, FwdUpdate&& value);
340 
347  template<typename FwdUpdate>
348  inline void update(float key, FwdUpdate&& value);
349 
364  template<typename FwdUpdate>
365  void update(const void* key, size_t length, FwdUpdate&& value);
366 
370  void trim();
371 
375  void reset();
376 
382  compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
383 
384  virtual iterator begin();
385  virtual iterator end();
386  virtual const_iterator begin() const;
387  virtual const_iterator end() const;
388 
389 protected:
390  Policy policy_;
391  tuple_map map_;
392 
393  // for builder
394  update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
395 
396  virtual void print_specifics(std::ostringstream& os) const;
397 };
398 
403 template<
404  typename Summary,
405  typename Allocator = std::allocator<Summary>
406 >
407 class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
408 public:
410  using Entry = typename Base::Entry;
411  using ExtractKey = typename Base::ExtractKey;
412  using iterator = typename Base::iterator;
413  using const_iterator = typename Base::const_iterator;
414  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
415  using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
416  using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
417  using vector_bytes = std::vector<uint8_t, AllocBytes>;
418  using comparator = compare_by_key<ExtractKey>;
419 
420  static const uint8_t SERIAL_VERSION_LEGACY = 1;
421  static const uint8_t SERIAL_VERSION = 3;
422  static const uint8_t SKETCH_FAMILY = 9;
423  static const uint8_t SKETCH_TYPE = 1;
424  static const uint8_t SKETCH_TYPE_LEGACY = 5;
425  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
426 
427  // Instances of this type can be obtained:
428  // - by compacting an update_tuple_sketch
429  // - as a result of a set operation
430  // - by deserializing a previously serialized compact sketch
431 
438  compact_tuple_sketch(const Base& other, bool ordered);
439 
445 
451 
452  virtual ~compact_tuple_sketch() = default;
453 
459  compact_tuple_sketch& operator=(const compact_tuple_sketch& other) = default;
460 
466  compact_tuple_sketch& operator=(compact_tuple_sketch&& other) = default;
467 
474  compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
475 
476  virtual Allocator get_allocator() const;
477  virtual bool is_empty() const;
478  virtual bool is_ordered() const;
479  virtual uint64_t get_theta64() const;
480  virtual uint32_t get_num_retained() const;
481  virtual uint16_t get_seed_hash() const;
482 
488  template<typename SerDe = serde<Summary>>
489  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
490 
500  template<typename SerDe = serde<Summary>>
501  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
502 
503  virtual iterator begin();
504  virtual iterator end();
505  virtual const_iterator begin() const;
506  virtual const_iterator end() const;
507 
516  template<typename SerDe = serde<Summary>>
517  static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
518  const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
519 
529  template<typename SerDe = serde<Summary>>
530  static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
531  const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
532 
533 protected:
534  bool is_empty_;
535  bool is_ordered_;
536  uint16_t seed_hash_;
537  uint64_t theta_;
538  std::vector<Entry, AllocEntry> entries_;
539 
545  template<typename SerDe, typename SS = Summary, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type = 0>
546  size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
547 
553  template<typename SerDe, typename SS = Summary, typename std::enable_if<!std::is_arithmetic<SS>::value, int>::type = 0>
554  size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
555 
556  // for deserialize
557  class deleter_of_summaries {
558  public:
559  deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator):
560  allocator_(allocator), num_(num), destroy_(destroy) {}
561  void set_destroy(bool destroy) { destroy_ = destroy; }
562  void operator() (Summary* ptr) {
563  if (ptr != nullptr) {
564  if (destroy_) {
565  for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary();
566  }
567  allocator_.deallocate(ptr, num_);
568  }
569  }
570  private:
571  Allocator allocator_;
572  uint32_t num_;
573  bool destroy_;
574  };
575 
576  virtual void print_specifics(std::ostringstream& os) const;
577 
578  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
579  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
580  template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
581  compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
582 
583 };
584 
586 template<typename Derived, typename Policy, typename Allocator>
587 class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
588 public:
589  tuple_base_builder(const Policy& policy, const Allocator& allocator);
590 
591 protected:
592  Policy policy_;
593 };
594 
596 template<typename S, typename U, typename P, typename A>
597 class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
598 public:
605  builder(const P& policy = P(), const A& allocator = A());
606 
611  update_tuple_sketch<S, U, P, A> build() const;
612 };
613 
614 } /* namespace datasketches */
615 
616 #include "tuple_sketch_impl.hpp"
617 
618 #endif
Compact Tuple sketch.
Definition: tuple_sketch.hpp:407
virtual uint64_t get_theta64() const
Definition: tuple_sketch_impl.hpp:333
virtual uint32_t get_num_retained() const
Definition: tuple_sketch_impl.hpp:338
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: tuple_sketch_impl.hpp:368
virtual bool is_empty() const
Definition: tuple_sketch_impl.hpp:323
virtual bool is_ordered() const
Definition: tuple_sketch_impl.hpp:328
virtual uint16_t get_seed_hash() const
Definition: tuple_sketch_impl.hpp:343
virtual iterator end()
Iterator pointing past the valid range.
Definition: tuple_sketch_impl.hpp:569
compact_tuple_sketch(const Base &other, bool ordered)
Copy constructor.
Definition: tuple_sketch_impl.hpp:281
virtual Allocator get_allocator() const
Definition: tuple_sketch_impl.hpp:318
static compact_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual iterator begin()
Iterator over entries in this sketch.
Definition: tuple_sketch_impl.hpp:564
compact_tuple_sketch(const compact_tuple_sketch &other)=default
Copy constructor.
size_t get_serialized_size_summaries_bytes(const SerDe &sd) const
Computes size needed to serialize summaries in the sketch.
Theta base builder.
Definition: theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition: theta_sketch.hpp:127
Tuple base builder.
Definition: tuple_sketch.hpp:587
Base class for Tuple sketch.
Definition: tuple_sketch.hpp:54
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate upper error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:57
virtual const_iterator begin() const =0
Const iterator over entries in this sketch.
double get_estimate() const
Definition: tuple_sketch_impl.hpp:40
virtual bool is_ordered() const =0
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual bool is_empty() const =0
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate lower error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:45
string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition: tuple_sketch_impl.hpp:69
virtual uint32_t get_num_retained() const =0
double get_theta() const
Definition: tuple_sketch_impl.hpp:34
virtual iterator end()=0
Iterator pointing past the valid range.
virtual uint16_t get_seed_hash() const =0
virtual iterator begin()=0
Iterator over entries in this sketch.
bool is_estimation_mode() const
Definition: tuple_sketch_impl.hpp:29
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Update Tuple sketch builder.
Definition: tuple_sketch.hpp:597
Update Tuple sketch.
Definition: tuple_sketch.hpp:217
void update(int16_t key, FwdUpdate &&value)
Update this sketch with a given signed 16-bit integer.
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition: tuple_sketch_impl.hpp:227
void update(uint64_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 64-bit integer.
virtual uint64_t get_theta64() const
Definition: tuple_sketch_impl.hpp:120
virtual uint32_t get_num_retained() const
Definition: tuple_sketch_impl.hpp:125
resize_factor get_rf() const
Definition: tuple_sketch_impl.hpp:140
void update(int32_t key, FwdUpdate &&value)
Update this sketch with a given signed 32-bit integer.
virtual bool is_empty() const
Definition: tuple_sketch_impl.hpp:110
virtual bool is_ordered() const
Definition: tuple_sketch_impl.hpp:115
void update(float key, FwdUpdate &&value)
Update this sketch with a given floating point value.
virtual uint16_t get_seed_hash() const
Definition: tuple_sketch_impl.hpp:130
virtual iterator end()
Iterator pointing past the valid range.
Definition: tuple_sketch_impl.hpp:242
void update(int8_t key, FwdUpdate &&value)
Update this sketch with a given signed 8-bit integer.
compact_tuple_sketch< Summary, Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition: tuple_sketch_impl.hpp:257
uint8_t get_lg_k() const
Definition: tuple_sketch_impl.hpp:135
void update(const std::string &key, FwdUpdate &&value)
Update this sketch with a given string.
void update(uint8_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 8-bit integer.
void update(uint32_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 32-bit integer.
virtual Allocator get_allocator() const
Definition: tuple_sketch_impl.hpp:105
void update(int64_t key, FwdUpdate &&value)
Update this sketch with a given signed 64-bit integer.
void reset()
Reset the sketch to the initial empty state.
Definition: tuple_sketch_impl.hpp:232
void update(const void *key, size_t length, FwdUpdate &&value)
Update this sketch with given data of any type.
virtual iterator begin()
Iterator over entries in this sketch.
Definition: tuple_sketch_impl.hpp:237
void update(double key, FwdUpdate &&value)
Update this sketch with a given double-precision floating point value.
void update(uint16_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 16-bit integer.
DataSketches namespace.
Definition: binomial_bounds.hpp:38
Interface for serializing and deserializing items.
Definition: serde.hpp:34