datasketches-cpp
Loading...
Searching...
No Matches
tuple_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef TUPLE_SKETCH_HPP_
21#define TUPLE_SKETCH_HPP_
22
23#include <string>
24
25#include "serde.hpp"
26#include "theta_update_sketch_base.hpp"
27
28namespace datasketches {
29
30// forward declarations
31template<typename S, typename A> class tuple_sketch;
32template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
33template<typename S, typename A> class compact_tuple_sketch;
34template<typename A> class theta_sketch_alloc;
35
36template<typename K, typename V>
37struct pair_extract_key {
38 K& operator()(std::pair<K, V>& entry) const {
39 return entry.first;
40 }
41 const K& operator()(const std::pair<K, V>& entry) const {
42 return entry.first;
43 }
44};
45
55template<
56 typename Summary,
57 typename Allocator = std::allocator<Summary>
58>
60public:
61 using Entry = std::pair<uint64_t, Summary>;
62 using ExtractKey = pair_extract_key<uint64_t, Summary>;
63 using iterator = theta_iterator<Entry, ExtractKey>;
64 using const_iterator = theta_const_iterator<Entry, ExtractKey>;
65
66 virtual ~tuple_sketch() = default;
67
71 virtual Allocator get_allocator() const = 0;
72
76 virtual bool is_empty() const = 0;
77
81 double get_estimate() const;
82
92 double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
93
101 double get_lower_bound(uint8_t num_std_devs) const;
102
103
113 double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
114
115
123 double get_upper_bound(uint8_t num_std_devs) const;
124
128 bool is_estimation_mode() const;
129
133 double get_theta() const;
134
138 virtual uint64_t get_theta64() const = 0;
139
143 virtual uint32_t get_num_retained() const = 0;
144
148 virtual uint16_t get_seed_hash() const = 0;
149
153 virtual bool is_ordered() const = 0;
154
160 string<Allocator> to_string(bool print_items = false) const;
161
166 virtual iterator begin() = 0;
167
173 virtual iterator end() = 0;
174
179 virtual const_iterator begin() const = 0;
180
186 virtual const_iterator end() const = 0;
187
188protected:
189 virtual void print_specifics(std::ostringstream& os) const = 0;
190
191 static uint16_t get_seed_hash(uint64_t seed);
192
193 static void check_sketch_type(uint8_t actual, uint8_t expected);
194 static void check_serial_version(uint8_t actual, uint8_t expected);
195 static void check_seed_hash(uint16_t actual, uint16_t expected);
196};
197
198// update sketch
199
200// for types with defined default constructor and + operation
201template<typename Summary, typename Update>
202struct default_tuple_update_policy {
203 Summary create() const {
204 return Summary();
205 }
206 void update(Summary& summary, const Update& update) const {
207 summary += update;
208 }
209};
210
216template<
217 typename Summary,
218 typename Update = Summary,
219 typename Policy = default_tuple_update_policy<Summary, Update>,
220 typename Allocator = std::allocator<Summary>
221>
222class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
223public:
225 using Entry = typename Base::Entry;
226 using ExtractKey = typename Base::ExtractKey;
227 using iterator = typename Base::iterator;
228 using const_iterator = typename Base::const_iterator;
229 using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
230 using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
231 using resize_factor = typename tuple_map::resize_factor;
232
233 // No constructor here. Use builder instead.
234 class builder;
235
237 update_tuple_sketch(update_tuple_sketch&&) noexcept = default;
238 virtual ~update_tuple_sketch() = default;
239 update_tuple_sketch& operator=(const update_tuple_sketch&) = default;
240 update_tuple_sketch& operator=(update_tuple_sketch&&) = default;
241
242 virtual Allocator get_allocator() const;
243 virtual bool is_empty() const;
244 virtual bool is_ordered() const;
245 virtual uint64_t get_theta64() const;
246 virtual uint32_t get_num_retained() const;
247 virtual uint16_t get_seed_hash() const;
248
252 uint8_t get_lg_k() const;
253
257 resize_factor get_rf() const;
258
267 template<typename FwdUpdate>
268 inline void update(const std::string& key, FwdUpdate&& value);
269
278 template<typename FwdUpdate>
279 inline void update(uint64_t key, FwdUpdate&& value);
280
289 template<typename FwdUpdate>
290 inline void update(int64_t key, FwdUpdate&& value);
291
301 template<typename FwdUpdate>
302 inline void update(uint32_t key, FwdUpdate&& value);
303
313 template<typename FwdUpdate>
314 inline void update(int32_t key, FwdUpdate&& value);
315
325 template<typename FwdUpdate>
326 inline void update(uint16_t key, FwdUpdate&& value);
327
337 template<typename FwdUpdate>
338 inline void update(int16_t key, FwdUpdate&& value);
339
349 template<typename FwdUpdate>
350 inline void update(uint8_t key, FwdUpdate&& value);
351
361 template<typename FwdUpdate>
362 inline void update(int8_t key, FwdUpdate&& value);
363
373 template<typename FwdUpdate>
374 inline void update(double key, FwdUpdate&& value);
375
385 template<typename FwdUpdate>
386 inline void update(float key, FwdUpdate&& value);
387
405 template<typename FwdUpdate>
406 void update(const void* key, size_t length, FwdUpdate&& value);
407
411 void trim();
412
416 void reset();
417
423 compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
424
431 template<typename Predicate>
432 compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
433
434 virtual iterator begin();
435 virtual iterator end();
436 virtual const_iterator begin() const;
437 virtual const_iterator end() const;
438
439protected:
440 Policy policy_;
441 tuple_map map_;
442
443 // for builder
444 update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
445
446 virtual void print_specifics(std::ostringstream& os) const;
447};
448
453template<
454 typename Summary,
455 typename Allocator = std::allocator<Summary>
456>
457class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
458public:
460 using Entry = typename Base::Entry;
461 using ExtractKey = typename Base::ExtractKey;
462 using iterator = typename Base::iterator;
463 using const_iterator = typename Base::const_iterator;
464 using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
465 using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
466 using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
467 using vector_bytes = std::vector<uint8_t, AllocBytes>;
468 using comparator = compare_by_key<ExtractKey>;
469
470 static const uint8_t SERIAL_VERSION_LEGACY = 1;
471 static const uint8_t SERIAL_VERSION = 3;
472 static const uint8_t SKETCH_FAMILY = 9;
473 static const uint8_t SKETCH_TYPE = 1;
474 static const uint8_t SKETCH_TYPE_LEGACY = 5;
475 enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
476
477 // Instances of this type can be obtained:
478 // - by compacting an update_tuple_sketch
479 // - as a result of a set operation
480 // - by deserializing a previously serialized compact sketch
481
488 compact_tuple_sketch(const Base& other, bool ordered);
489
495
501
502 virtual ~compact_tuple_sketch() = default;
503
509 compact_tuple_sketch& operator=(const compact_tuple_sketch& other) = default;
510
516 compact_tuple_sketch& operator=(compact_tuple_sketch&& other) = default;
517
524 compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
525
526 virtual Allocator get_allocator() const;
527 virtual bool is_empty() const;
528 virtual bool is_ordered() const;
529 virtual uint64_t get_theta64() const;
530 virtual uint32_t get_num_retained() const;
531 virtual uint16_t get_seed_hash() const;
532
539 template<typename Predicate>
540 compact_tuple_sketch filter(const Predicate& predicate) const;
541
549 template<typename Sketch, typename Predicate>
550 static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
551
557 template<typename SerDe = serde<Summary>>
558 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
559
569 template<typename SerDe = serde<Summary>>
570 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
571
572 virtual iterator begin();
573 virtual iterator end();
574 virtual const_iterator begin() const;
575 virtual const_iterator end() const;
576
585 template<typename SerDe = serde<Summary>>
586 static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
587 const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
588
598 template<typename SerDe = serde<Summary>>
599 static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
600 const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
601
602protected:
603 bool is_empty_;
604 bool is_ordered_;
605 uint16_t seed_hash_;
606 uint64_t theta_;
607 std::vector<Entry, AllocEntry> entries_;
608
614 template<typename SerDe, typename SS = Summary, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type = 0>
615 size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
616
622 template<typename SerDe, typename SS = Summary, typename std::enable_if<!std::is_arithmetic<SS>::value, int>::type = 0>
623 size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
624
625 // for deserialize
626 class deleter_of_summaries {
627 public:
628 deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator):
629 allocator_(allocator), num_(num), destroy_(destroy) {}
630 void set_destroy(bool destroy) { destroy_ = destroy; }
631 void operator() (Summary* ptr) {
632 if (ptr != nullptr) {
633 if (destroy_) {
634 for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary();
635 }
636 allocator_.deallocate(ptr, num_);
637 }
638 }
639 private:
640 Allocator allocator_;
641 uint32_t num_;
642 bool destroy_;
643 };
644
645 virtual void print_specifics(std::ostringstream& os) const;
646
647 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
648 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
649 template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
650 compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
651};
652
654template<typename Derived, typename Policy, typename Allocator>
655class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
656public:
657 tuple_base_builder(const Policy& policy, const Allocator& allocator);
658
659protected:
660 Policy policy_;
661};
662
664template<typename S, typename U, typename P, typename A>
665class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
666public:
673 builder(const P& policy = P(), const A& allocator = A());
674
680};
681
682} /* namespace datasketches */
683
684#include "tuple_sketch_impl.hpp"
685
686#endif
Compact Tuple sketch.
Definition tuple_sketch.hpp:457
virtual uint64_t get_theta64() const
Definition tuple_sketch_impl.hpp:339
virtual uint32_t get_num_retained() const
Definition tuple_sketch_impl.hpp:344
compact_tuple_sketch filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition tuple_sketch_impl.hpp:401
virtual bool is_empty() const
Definition tuple_sketch_impl.hpp:329
virtual bool is_ordered() const
Definition tuple_sketch_impl.hpp:334
virtual uint16_t get_seed_hash() const
Definition tuple_sketch_impl.hpp:349
virtual iterator end()
Iterator pointing past the valid range.
Definition tuple_sketch_impl.hpp:602
virtual Allocator get_allocator() const
Definition tuple_sketch_impl.hpp:324
static compact_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual iterator begin()
Iterator over entries in this sketch.
Definition tuple_sketch_impl.hpp:597
compact_tuple_sketch(const compact_tuple_sketch &other)=default
Copy constructor.
size_t get_serialized_size_summaries_bytes(const SerDe &sd) const
Computes size needed to serialize summaries in the sketch.
Theta base builder.
Definition theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition theta_sketch.hpp:127
Tuple base builder.
Definition tuple_sketch.hpp:655
Base class for Tuple sketch.
Definition tuple_sketch.hpp:59
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate upper error bound given a number of standard deviations over an arbitrary num...
Definition tuple_sketch_impl.hpp:57
virtual const_iterator begin() const =0
Const iterator over entries in this sketch.
double get_estimate() const
Definition tuple_sketch_impl.hpp:40
virtual bool is_ordered() const =0
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual bool is_empty() const =0
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate lower error bound given a number of standard deviations over an arbitrary num...
Definition tuple_sketch_impl.hpp:45
string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition tuple_sketch_impl.hpp:69
virtual uint32_t get_num_retained() const =0
double get_theta() const
Definition tuple_sketch_impl.hpp:34
virtual iterator end()=0
Iterator pointing past the valid range.
virtual uint16_t get_seed_hash() const =0
virtual iterator begin()=0
Iterator over entries in this sketch.
bool is_estimation_mode() const
Definition tuple_sketch_impl.hpp:29
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Update Tuple sketch builder.
Definition tuple_sketch.hpp:665
Update Tuple sketch.
Definition tuple_sketch.hpp:222
void update(int16_t key, FwdUpdate &&value)
Update this sketch with a given signed 16-bit integer.
compact_tuple_sketch< Summary, Allocator > filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition tuple_sketch_impl.hpp:227
void update(uint64_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 64-bit integer.
virtual uint64_t get_theta64() const
Definition tuple_sketch_impl.hpp:120
virtual uint32_t get_num_retained() const
Definition tuple_sketch_impl.hpp:125
resize_factor get_rf() const
Definition tuple_sketch_impl.hpp:140
void update(int32_t key, FwdUpdate &&value)
Update this sketch with a given signed 32-bit integer.
virtual bool is_empty() const
Definition tuple_sketch_impl.hpp:110
virtual bool is_ordered() const
Definition tuple_sketch_impl.hpp:115
void update(float key, FwdUpdate &&value)
Update this sketch with a given floating point value.
virtual uint16_t get_seed_hash() const
Definition tuple_sketch_impl.hpp:130
virtual iterator end()
Iterator pointing past the valid range.
Definition tuple_sketch_impl.hpp:242
void update(int8_t key, FwdUpdate &&value)
Update this sketch with a given signed 8-bit integer.
compact_tuple_sketch< Summary, Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition tuple_sketch_impl.hpp:257
uint8_t get_lg_k() const
Definition tuple_sketch_impl.hpp:135
void update(const std::string &key, FwdUpdate &&value)
Update this sketch with a given string.
void update(uint8_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 8-bit integer.
void update(uint32_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 32-bit integer.
virtual Allocator get_allocator() const
Definition tuple_sketch_impl.hpp:105
void update(int64_t key, FwdUpdate &&value)
Update this sketch with a given signed 64-bit integer.
void reset()
Reset the sketch to the initial empty state.
Definition tuple_sketch_impl.hpp:232
void update(const void *key, size_t length, FwdUpdate &&value)
Update this sketch with given data of any type.
virtual iterator begin()
Iterator over entries in this sketch.
Definition tuple_sketch_impl.hpp:237
void update(double key, FwdUpdate &&value)
Update this sketch with a given double-precision floating point value.
void update(uint16_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 16-bit integer.
DataSketches namespace.
Definition binomial_bounds.hpp:38
Interface for serializing and deserializing items.
Definition serde.hpp:34