datasketches-cpp
tuple_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef TUPLE_SKETCH_HPP_
21 #define TUPLE_SKETCH_HPP_
22 
23 #include <string>
24 
25 #include "serde.hpp"
26 #include "theta_update_sketch_base.hpp"
27 
28 namespace datasketches {
29 
30 // forward declarations
31 template<typename S, typename A> class tuple_sketch;
32 template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
33 template<typename S, typename A> class compact_tuple_sketch;
34 template<typename A> class theta_sketch_alloc;
35 
36 template<typename K, typename V>
37 struct pair_extract_key {
38  K& operator()(std::pair<K, V>& entry) const {
39  return entry.first;
40  }
41  const K& operator()(const std::pair<K, V>& entry) const {
42  return entry.first;
43  }
44 };
45 
50 template<
51  typename Summary,
52  typename Allocator = std::allocator<Summary>
53 >
54 class tuple_sketch {
55 public:
56  using Entry = std::pair<uint64_t, Summary>;
57  using ExtractKey = pair_extract_key<uint64_t, Summary>;
58  using iterator = theta_iterator<Entry, ExtractKey>;
59  using const_iterator = theta_const_iterator<Entry, ExtractKey>;
60 
61  virtual ~tuple_sketch() = default;
62 
66  virtual Allocator get_allocator() const = 0;
67 
71  virtual bool is_empty() const = 0;
72 
76  double get_estimate() const;
77 
87  double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
88 
96  double get_lower_bound(uint8_t num_std_devs) const;
97 
98 
108  double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
109 
110 
118  double get_upper_bound(uint8_t num_std_devs) const;
119 
123  bool is_estimation_mode() const;
124 
128  double get_theta() const;
129 
133  virtual uint64_t get_theta64() const = 0;
134 
138  virtual uint32_t get_num_retained() const = 0;
139 
143  virtual uint16_t get_seed_hash() const = 0;
144 
148  virtual bool is_ordered() const = 0;
149 
155  string<Allocator> to_string(bool print_items = false) const;
156 
161  virtual iterator begin() = 0;
162 
168  virtual iterator end() = 0;
169 
174  virtual const_iterator begin() const = 0;
175 
181  virtual const_iterator end() const = 0;
182 
183 protected:
184  virtual void print_specifics(std::ostringstream& os) const = 0;
185 
186  static uint16_t get_seed_hash(uint64_t seed);
187 
188  static void check_sketch_type(uint8_t actual, uint8_t expected);
189  static void check_serial_version(uint8_t actual, uint8_t expected);
190  static void check_seed_hash(uint16_t actual, uint16_t expected);
191 };
192 
193 // update sketch
194 
195 // for types with defined default constructor and + operation
196 template<typename Summary, typename Update>
197 struct default_tuple_update_policy {
198  Summary create() const {
199  return Summary();
200  }
201  void update(Summary& summary, const Update& update) const {
202  summary += update;
203  }
204 };
205 
211 template<
212  typename Summary,
213  typename Update = Summary,
214  typename Policy = default_tuple_update_policy<Summary, Update>,
215  typename Allocator = std::allocator<Summary>
216 >
217 class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
218 public:
220  using Entry = typename Base::Entry;
221  using ExtractKey = typename Base::ExtractKey;
222  using iterator = typename Base::iterator;
223  using const_iterator = typename Base::const_iterator;
224  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
225  using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
226  using resize_factor = typename tuple_map::resize_factor;
227 
228  // No constructor here. Use builder instead.
229  class builder;
230 
231  update_tuple_sketch(const update_tuple_sketch&) = default;
232  update_tuple_sketch(update_tuple_sketch&&) noexcept = default;
233  virtual ~update_tuple_sketch() = default;
234  update_tuple_sketch& operator=(const update_tuple_sketch&) = default;
235  update_tuple_sketch& operator=(update_tuple_sketch&&) = default;
236 
237  virtual Allocator get_allocator() const;
238  virtual bool is_empty() const;
239  virtual bool is_ordered() const;
240  virtual uint64_t get_theta64() const;
241  virtual uint32_t get_num_retained() const;
242  virtual uint16_t get_seed_hash() const;
243 
247  uint8_t get_lg_k() const;
248 
252  resize_factor get_rf() const;
253 
259  template<typename FwdUpdate>
260  inline void update(const std::string& key, FwdUpdate&& value);
261 
267  template<typename FwdUpdate>
268  inline void update(uint64_t key, FwdUpdate&& value);
269 
275  template<typename FwdUpdate>
276  inline void update(int64_t key, FwdUpdate&& value);
277 
284  template<typename FwdUpdate>
285  inline void update(uint32_t key, FwdUpdate&& value);
286 
293  template<typename FwdUpdate>
294  inline void update(int32_t key, FwdUpdate&& value);
295 
302  template<typename FwdUpdate>
303  inline void update(uint16_t key, FwdUpdate&& value);
304 
311  template<typename FwdUpdate>
312  inline void update(int16_t key, FwdUpdate&& value);
313 
320  template<typename FwdUpdate>
321  inline void update(uint8_t key, FwdUpdate&& value);
322 
329  template<typename FwdUpdate>
330  inline void update(int8_t key, FwdUpdate&& value);
331 
338  template<typename FwdUpdate>
339  inline void update(double key, FwdUpdate&& value);
340 
347  template<typename FwdUpdate>
348  inline void update(float key, FwdUpdate&& value);
349 
364  template<typename FwdUpdate>
365  void update(const void* key, size_t length, FwdUpdate&& value);
366 
370  void trim();
371 
375  void reset();
376 
382  compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
383 
390  template<typename Predicate>
391  compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
392 
393  virtual iterator begin();
394  virtual iterator end();
395  virtual const_iterator begin() const;
396  virtual const_iterator end() const;
397 
398 protected:
399  Policy policy_;
400  tuple_map map_;
401 
402  // for builder
403  update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
404 
405  virtual void print_specifics(std::ostringstream& os) const;
406 };
407 
412 template<
413  typename Summary,
414  typename Allocator = std::allocator<Summary>
415 >
416 class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
417 public:
419  using Entry = typename Base::Entry;
420  using ExtractKey = typename Base::ExtractKey;
421  using iterator = typename Base::iterator;
422  using const_iterator = typename Base::const_iterator;
423  using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
424  using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
425  using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
426  using vector_bytes = std::vector<uint8_t, AllocBytes>;
427  using comparator = compare_by_key<ExtractKey>;
428 
429  static const uint8_t SERIAL_VERSION_LEGACY = 1;
430  static const uint8_t SERIAL_VERSION = 3;
431  static const uint8_t SKETCH_FAMILY = 9;
432  static const uint8_t SKETCH_TYPE = 1;
433  static const uint8_t SKETCH_TYPE_LEGACY = 5;
434  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
435 
436  // Instances of this type can be obtained:
437  // - by compacting an update_tuple_sketch
438  // - as a result of a set operation
439  // - by deserializing a previously serialized compact sketch
440 
447  compact_tuple_sketch(const Base& other, bool ordered);
448 
454 
460 
461  virtual ~compact_tuple_sketch() = default;
462 
468  compact_tuple_sketch& operator=(const compact_tuple_sketch& other) = default;
469 
475  compact_tuple_sketch& operator=(compact_tuple_sketch&& other) = default;
476 
483  compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
484 
485  virtual Allocator get_allocator() const;
486  virtual bool is_empty() const;
487  virtual bool is_ordered() const;
488  virtual uint64_t get_theta64() const;
489  virtual uint32_t get_num_retained() const;
490  virtual uint16_t get_seed_hash() const;
491 
498  template<typename Predicate>
499  compact_tuple_sketch filter(const Predicate& predicate) const;
500 
508  template<typename Sketch, typename Predicate>
509  static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
510 
516  template<typename SerDe = serde<Summary>>
517  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
518 
528  template<typename SerDe = serde<Summary>>
529  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
530 
531  virtual iterator begin();
532  virtual iterator end();
533  virtual const_iterator begin() const;
534  virtual const_iterator end() const;
535 
544  template<typename SerDe = serde<Summary>>
545  static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
546  const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
547 
557  template<typename SerDe = serde<Summary>>
558  static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
559  const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
560 
561 protected:
562  bool is_empty_;
563  bool is_ordered_;
564  uint16_t seed_hash_;
565  uint64_t theta_;
566  std::vector<Entry, AllocEntry> entries_;
567 
573  template<typename SerDe, typename SS = Summary, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type = 0>
574  size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
575 
581  template<typename SerDe, typename SS = Summary, typename std::enable_if<!std::is_arithmetic<SS>::value, int>::type = 0>
582  size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
583 
584  // for deserialize
585  class deleter_of_summaries {
586  public:
587  deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator):
588  allocator_(allocator), num_(num), destroy_(destroy) {}
589  void set_destroy(bool destroy) { destroy_ = destroy; }
590  void operator() (Summary* ptr) {
591  if (ptr != nullptr) {
592  if (destroy_) {
593  for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary();
594  }
595  allocator_.deallocate(ptr, num_);
596  }
597  }
598  private:
599  Allocator allocator_;
600  uint32_t num_;
601  bool destroy_;
602  };
603 
604  virtual void print_specifics(std::ostringstream& os) const;
605 
606  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
607  template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
608  template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
609  compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
610 };
611 
613 template<typename Derived, typename Policy, typename Allocator>
614 class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
615 public:
616  tuple_base_builder(const Policy& policy, const Allocator& allocator);
617 
618 protected:
619  Policy policy_;
620 };
621 
623 template<typename S, typename U, typename P, typename A>
624 class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
625 public:
632  builder(const P& policy = P(), const A& allocator = A());
633 
638  update_tuple_sketch<S, U, P, A> build() const;
639 };
640 
641 } /* namespace datasketches */
642 
643 #include "tuple_sketch_impl.hpp"
644 
645 #endif
Compact Tuple sketch.
Definition: tuple_sketch.hpp:416
virtual uint64_t get_theta64() const
Definition: tuple_sketch_impl.hpp:339
virtual uint32_t get_num_retained() const
Definition: tuple_sketch_impl.hpp:344
compact_tuple_sketch filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition: tuple_sketch_impl.hpp:401
virtual bool is_empty() const
Definition: tuple_sketch_impl.hpp:329
virtual bool is_ordered() const
Definition: tuple_sketch_impl.hpp:334
virtual uint16_t get_seed_hash() const
Definition: tuple_sketch_impl.hpp:349
virtual iterator end()
Iterator pointing past the valid range.
Definition: tuple_sketch_impl.hpp:602
compact_tuple_sketch(const Base &other, bool ordered)
Copy constructor.
Definition: tuple_sketch_impl.hpp:287
virtual Allocator get_allocator() const
Definition: tuple_sketch_impl.hpp:324
static compact_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual iterator begin()
Iterator over entries in this sketch.
Definition: tuple_sketch_impl.hpp:597
compact_tuple_sketch(const compact_tuple_sketch &other)=default
Copy constructor.
size_t get_serialized_size_summaries_bytes(const SerDe &sd) const
Computes size needed to serialize summaries in the sketch.
Theta base builder.
Definition: theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition: theta_sketch.hpp:127
Tuple base builder.
Definition: tuple_sketch.hpp:614
Base class for Tuple sketch.
Definition: tuple_sketch.hpp:54
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate upper error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:57
virtual const_iterator begin() const =0
Const iterator over entries in this sketch.
double get_estimate() const
Definition: tuple_sketch_impl.hpp:40
virtual bool is_ordered() const =0
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual bool is_empty() const =0
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate lower error bound given a number of standard deviations over an arbitrary num...
Definition: tuple_sketch_impl.hpp:45
string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition: tuple_sketch_impl.hpp:69
virtual uint32_t get_num_retained() const =0
double get_theta() const
Definition: tuple_sketch_impl.hpp:34
virtual iterator end()=0
Iterator pointing past the valid range.
virtual uint16_t get_seed_hash() const =0
virtual iterator begin()=0
Iterator over entries in this sketch.
bool is_estimation_mode() const
Definition: tuple_sketch_impl.hpp:29
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Update Tuple sketch builder.
Definition: tuple_sketch.hpp:624
Update Tuple sketch.
Definition: tuple_sketch.hpp:217
void update(int16_t key, FwdUpdate &&value)
Update this sketch with a given signed 16-bit integer.
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition: tuple_sketch_impl.hpp:227
void update(uint64_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 64-bit integer.
virtual uint64_t get_theta64() const
Definition: tuple_sketch_impl.hpp:120
virtual uint32_t get_num_retained() const
Definition: tuple_sketch_impl.hpp:125
resize_factor get_rf() const
Definition: tuple_sketch_impl.hpp:140
void update(int32_t key, FwdUpdate &&value)
Update this sketch with a given signed 32-bit integer.
virtual bool is_empty() const
Definition: tuple_sketch_impl.hpp:110
virtual bool is_ordered() const
Definition: tuple_sketch_impl.hpp:115
void update(float key, FwdUpdate &&value)
Update this sketch with a given floating point value.
virtual uint16_t get_seed_hash() const
Definition: tuple_sketch_impl.hpp:130
virtual iterator end()
Iterator pointing past the valid range.
Definition: tuple_sketch_impl.hpp:242
void update(int8_t key, FwdUpdate &&value)
Update this sketch with a given signed 8-bit integer.
compact_tuple_sketch< Summary, Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition: tuple_sketch_impl.hpp:257
uint8_t get_lg_k() const
Definition: tuple_sketch_impl.hpp:135
void update(const std::string &key, FwdUpdate &&value)
Update this sketch with a given string.
void update(uint8_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 8-bit integer.
void update(uint32_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 32-bit integer.
virtual Allocator get_allocator() const
Definition: tuple_sketch_impl.hpp:105
compact_tuple_sketch< Summary, Allocator > filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void update(int64_t key, FwdUpdate &&value)
Update this sketch with a given signed 64-bit integer.
void reset()
Reset the sketch to the initial empty state.
Definition: tuple_sketch_impl.hpp:232
void update(const void *key, size_t length, FwdUpdate &&value)
Update this sketch with given data of any type.
virtual iterator begin()
Iterator over entries in this sketch.
Definition: tuple_sketch_impl.hpp:237
void update(double key, FwdUpdate &&value)
Update this sketch with a given double-precision floating point value.
void update(uint16_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 16-bit integer.
DataSketches namespace.
Definition: binomial_bounds.hpp:38
Interface for serializing and deserializing items.
Definition: serde.hpp:34