datasketches-cpp
Loading...
Searching...
No Matches
tuple_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef TUPLE_SKETCH_HPP_
21#define TUPLE_SKETCH_HPP_
22
23#include <string>
24
25#include "serde.hpp"
26#include "theta_update_sketch_base.hpp"
27
28namespace datasketches {
29
30// forward declarations
31template<typename S, typename A> class tuple_sketch;
32template<typename S, typename U, typename P, typename A> class update_tuple_sketch;
33template<typename S, typename A> class compact_tuple_sketch;
34template<typename A> class theta_sketch_alloc;
35
36template<typename K, typename V>
37struct pair_extract_key {
38 K& operator()(std::pair<K, V>& entry) const {
39 return entry.first;
40 }
41 const K& operator()(const std::pair<K, V>& entry) const {
42 return entry.first;
43 }
44};
45
50template<
51 typename Summary,
52 typename Allocator = std::allocator<Summary>
53>
55public:
56 using Entry = std::pair<uint64_t, Summary>;
57 using ExtractKey = pair_extract_key<uint64_t, Summary>;
58 using iterator = theta_iterator<Entry, ExtractKey>;
59 using const_iterator = theta_const_iterator<Entry, ExtractKey>;
60
61 virtual ~tuple_sketch() = default;
62
66 virtual Allocator get_allocator() const = 0;
67
71 virtual bool is_empty() const = 0;
72
76 double get_estimate() const;
77
87 double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
88
96 double get_lower_bound(uint8_t num_std_devs) const;
97
98
108 double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const ;
109
110
118 double get_upper_bound(uint8_t num_std_devs) const;
119
123 bool is_estimation_mode() const;
124
128 double get_theta() const;
129
133 virtual uint64_t get_theta64() const = 0;
134
138 virtual uint32_t get_num_retained() const = 0;
139
143 virtual uint16_t get_seed_hash() const = 0;
144
148 virtual bool is_ordered() const = 0;
149
155 string<Allocator> to_string(bool print_items = false) const;
156
161 virtual iterator begin() = 0;
162
168 virtual iterator end() = 0;
169
174 virtual const_iterator begin() const = 0;
175
181 virtual const_iterator end() const = 0;
182
183protected:
184 virtual void print_specifics(std::ostringstream& os) const = 0;
185
186 static uint16_t get_seed_hash(uint64_t seed);
187
188 static void check_sketch_type(uint8_t actual, uint8_t expected);
189 static void check_serial_version(uint8_t actual, uint8_t expected);
190 static void check_seed_hash(uint16_t actual, uint16_t expected);
191};
192
193// update sketch
194
195// for types with defined default constructor and + operation
196template<typename Summary, typename Update>
197struct default_tuple_update_policy {
198 Summary create() const {
199 return Summary();
200 }
201 void update(Summary& summary, const Update& update) const {
202 summary += update;
203 }
204};
205
211template<
212 typename Summary,
213 typename Update = Summary,
214 typename Policy = default_tuple_update_policy<Summary, Update>,
215 typename Allocator = std::allocator<Summary>
216>
217class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
218public:
220 using Entry = typename Base::Entry;
221 using ExtractKey = typename Base::ExtractKey;
222 using iterator = typename Base::iterator;
223 using const_iterator = typename Base::const_iterator;
224 using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
225 using tuple_map = theta_update_sketch_base<Entry, ExtractKey, AllocEntry>;
226 using resize_factor = typename tuple_map::resize_factor;
227
228 // No constructor here. Use builder instead.
229 class builder;
230
232 update_tuple_sketch(update_tuple_sketch&&) noexcept = default;
233 virtual ~update_tuple_sketch() = default;
234 update_tuple_sketch& operator=(const update_tuple_sketch&) = default;
235 update_tuple_sketch& operator=(update_tuple_sketch&&) = default;
236
237 virtual Allocator get_allocator() const;
238 virtual bool is_empty() const;
239 virtual bool is_ordered() const;
240 virtual uint64_t get_theta64() const;
241 virtual uint32_t get_num_retained() const;
242 virtual uint16_t get_seed_hash() const;
243
247 uint8_t get_lg_k() const;
248
252 resize_factor get_rf() const;
253
259 template<typename FwdUpdate>
260 inline void update(const std::string& key, FwdUpdate&& value);
261
267 template<typename FwdUpdate>
268 inline void update(uint64_t key, FwdUpdate&& value);
269
275 template<typename FwdUpdate>
276 inline void update(int64_t key, FwdUpdate&& value);
277
284 template<typename FwdUpdate>
285 inline void update(uint32_t key, FwdUpdate&& value);
286
293 template<typename FwdUpdate>
294 inline void update(int32_t key, FwdUpdate&& value);
295
302 template<typename FwdUpdate>
303 inline void update(uint16_t key, FwdUpdate&& value);
304
311 template<typename FwdUpdate>
312 inline void update(int16_t key, FwdUpdate&& value);
313
320 template<typename FwdUpdate>
321 inline void update(uint8_t key, FwdUpdate&& value);
322
329 template<typename FwdUpdate>
330 inline void update(int8_t key, FwdUpdate&& value);
331
338 template<typename FwdUpdate>
339 inline void update(double key, FwdUpdate&& value);
340
347 template<typename FwdUpdate>
348 inline void update(float key, FwdUpdate&& value);
349
364 template<typename FwdUpdate>
365 void update(const void* key, size_t length, FwdUpdate&& value);
366
370 void trim();
371
375 void reset();
376
382 compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
383
390 template<typename Predicate>
391 compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
392
393 virtual iterator begin();
394 virtual iterator end();
395 virtual const_iterator begin() const;
396 virtual const_iterator end() const;
397
398protected:
399 Policy policy_;
400 tuple_map map_;
401
402 // for builder
403 update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
404
405 virtual void print_specifics(std::ostringstream& os) const;
406};
407
412template<
413 typename Summary,
414 typename Allocator = std::allocator<Summary>
415>
416class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
417public:
419 using Entry = typename Base::Entry;
420 using ExtractKey = typename Base::ExtractKey;
421 using iterator = typename Base::iterator;
422 using const_iterator = typename Base::const_iterator;
423 using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
424 using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
425 using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
426 using vector_bytes = std::vector<uint8_t, AllocBytes>;
427 using comparator = compare_by_key<ExtractKey>;
428
429 static const uint8_t SERIAL_VERSION_LEGACY = 1;
430 static const uint8_t SERIAL_VERSION = 3;
431 static const uint8_t SKETCH_FAMILY = 9;
432 static const uint8_t SKETCH_TYPE = 1;
433 static const uint8_t SKETCH_TYPE_LEGACY = 5;
434 enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
435
436 // Instances of this type can be obtained:
437 // - by compacting an update_tuple_sketch
438 // - as a result of a set operation
439 // - by deserializing a previously serialized compact sketch
440
447 compact_tuple_sketch(const Base& other, bool ordered);
448
454
460
461 virtual ~compact_tuple_sketch() = default;
462
468 compact_tuple_sketch& operator=(const compact_tuple_sketch& other) = default;
469
475 compact_tuple_sketch& operator=(compact_tuple_sketch&& other) = default;
476
483 compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true);
484
485 virtual Allocator get_allocator() const;
486 virtual bool is_empty() const;
487 virtual bool is_ordered() const;
488 virtual uint64_t get_theta64() const;
489 virtual uint32_t get_num_retained() const;
490 virtual uint16_t get_seed_hash() const;
491
498 template<typename Predicate>
499 compact_tuple_sketch filter(const Predicate& predicate) const;
500
508 template<typename Sketch, typename Predicate>
509 static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
510
516 template<typename SerDe = serde<Summary>>
517 void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
518
528 template<typename SerDe = serde<Summary>>
529 vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
530
531 virtual iterator begin();
532 virtual iterator end();
533 virtual const_iterator begin() const;
534 virtual const_iterator end() const;
535
544 template<typename SerDe = serde<Summary>>
545 static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
546 const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
547
557 template<typename SerDe = serde<Summary>>
558 static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
559 const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
560
561protected:
562 bool is_empty_;
563 bool is_ordered_;
564 uint16_t seed_hash_;
565 uint64_t theta_;
566 std::vector<Entry, AllocEntry> entries_;
567
573 template<typename SerDe, typename SS = Summary, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type = 0>
574 size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
575
581 template<typename SerDe, typename SS = Summary, typename std::enable_if<!std::is_arithmetic<SS>::value, int>::type = 0>
582 size_t get_serialized_size_summaries_bytes(const SerDe& sd) const;
583
584 // for deserialize
585 class deleter_of_summaries {
586 public:
587 deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator):
588 allocator_(allocator), num_(num), destroy_(destroy) {}
589 void set_destroy(bool destroy) { destroy_ = destroy; }
590 void operator() (Summary* ptr) {
591 if (ptr != nullptr) {
592 if (destroy_) {
593 for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary();
594 }
595 allocator_.deallocate(ptr, num_);
596 }
597 }
598 private:
599 Allocator allocator_;
600 uint32_t num_;
601 bool destroy_;
602 };
603
604 virtual void print_specifics(std::ostringstream& os) const;
605
606 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
607 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
608 template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
609 compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
610};
611
613template<typename Derived, typename Policy, typename Allocator>
614class tuple_base_builder: public theta_base_builder<Derived, Allocator> {
615public:
616 tuple_base_builder(const Policy& policy, const Allocator& allocator);
617
618protected:
619 Policy policy_;
620};
621
623template<typename S, typename U, typename P, typename A>
624class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> {
625public:
632 builder(const P& policy = P(), const A& allocator = A());
633
639};
640
641} /* namespace datasketches */
642
643#include "tuple_sketch_impl.hpp"
644
645#endif
Compact Tuple sketch.
Definition tuple_sketch.hpp:416
virtual uint64_t get_theta64() const
Definition tuple_sketch_impl.hpp:339
virtual uint32_t get_num_retained() const
Definition tuple_sketch_impl.hpp:344
compact_tuple_sketch filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void serialize(std::ostream &os, const SerDe &sd=SerDe()) const
This method serializes the sketch into a given stream in a binary form.
Definition tuple_sketch_impl.hpp:401
virtual bool is_empty() const
Definition tuple_sketch_impl.hpp:329
virtual bool is_ordered() const
Definition tuple_sketch_impl.hpp:334
virtual uint16_t get_seed_hash() const
Definition tuple_sketch_impl.hpp:349
virtual iterator end()
Iterator pointing past the valid range.
Definition tuple_sketch_impl.hpp:602
virtual Allocator get_allocator() const
Definition tuple_sketch_impl.hpp:324
static compact_tuple_sketch deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const SerDe &sd=SerDe(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual iterator begin()
Iterator over entries in this sketch.
Definition tuple_sketch_impl.hpp:597
compact_tuple_sketch(const compact_tuple_sketch &other)=default
Copy constructor.
size_t get_serialized_size_summaries_bytes(const SerDe &sd) const
Computes size needed to serialize summaries in the sketch.
Theta base builder.
Definition theta_update_sketch_base.hpp:97
Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch.
Definition theta_sketch.hpp:127
Tuple base builder.
Definition tuple_sketch.hpp:614
Base class for Tuple sketch.
Definition tuple_sketch.hpp:54
double get_upper_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate upper error bound given a number of standard deviations over an arbitrary num...
Definition tuple_sketch_impl.hpp:57
virtual const_iterator begin() const =0
Const iterator over entries in this sketch.
double get_estimate() const
Definition tuple_sketch_impl.hpp:40
virtual bool is_ordered() const =0
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual bool is_empty() const =0
double get_lower_bound(uint8_t num_std_devs, uint32_t num_subset_entries) const
Returns the approximate lower error bound given a number of standard deviations over an arbitrary num...
Definition tuple_sketch_impl.hpp:45
string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition tuple_sketch_impl.hpp:69
virtual uint32_t get_num_retained() const =0
double get_theta() const
Definition tuple_sketch_impl.hpp:34
virtual iterator end()=0
Iterator pointing past the valid range.
virtual uint16_t get_seed_hash() const =0
virtual iterator begin()=0
Iterator over entries in this sketch.
bool is_estimation_mode() const
Definition tuple_sketch_impl.hpp:29
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Update Tuple sketch builder.
Definition tuple_sketch.hpp:624
Update Tuple sketch.
Definition tuple_sketch.hpp:217
void update(int16_t key, FwdUpdate &&value)
Update this sketch with a given signed 16-bit integer.
compact_tuple_sketch< Summary, Allocator > filter(const Predicate &predicate) const
Produces a Compact Tuple sketch from this sketch by applying a given predicate to each entry.
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition tuple_sketch_impl.hpp:227
void update(uint64_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 64-bit integer.
virtual uint64_t get_theta64() const
Definition tuple_sketch_impl.hpp:120
virtual uint32_t get_num_retained() const
Definition tuple_sketch_impl.hpp:125
resize_factor get_rf() const
Definition tuple_sketch_impl.hpp:140
void update(int32_t key, FwdUpdate &&value)
Update this sketch with a given signed 32-bit integer.
virtual bool is_empty() const
Definition tuple_sketch_impl.hpp:110
virtual bool is_ordered() const
Definition tuple_sketch_impl.hpp:115
void update(float key, FwdUpdate &&value)
Update this sketch with a given floating point value.
virtual uint16_t get_seed_hash() const
Definition tuple_sketch_impl.hpp:130
virtual iterator end()
Iterator pointing past the valid range.
Definition tuple_sketch_impl.hpp:242
void update(int8_t key, FwdUpdate &&value)
Update this sketch with a given signed 8-bit integer.
compact_tuple_sketch< Summary, Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition tuple_sketch_impl.hpp:257
uint8_t get_lg_k() const
Definition tuple_sketch_impl.hpp:135
void update(const std::string &key, FwdUpdate &&value)
Update this sketch with a given string.
void update(uint8_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 8-bit integer.
void update(uint32_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 32-bit integer.
virtual Allocator get_allocator() const
Definition tuple_sketch_impl.hpp:105
void update(int64_t key, FwdUpdate &&value)
Update this sketch with a given signed 64-bit integer.
void reset()
Reset the sketch to the initial empty state.
Definition tuple_sketch_impl.hpp:232
void update(const void *key, size_t length, FwdUpdate &&value)
Update this sketch with given data of any type.
virtual iterator begin()
Iterator over entries in this sketch.
Definition tuple_sketch_impl.hpp:237
void update(double key, FwdUpdate &&value)
Update this sketch with a given double-precision floating point value.
void update(uint16_t key, FwdUpdate &&value)
Update this sketch with a given unsigned 16-bit integer.
DataSketches namespace.
Definition binomial_bounds.hpp:38
Interface for serializing and deserializing items.
Definition serde.hpp:34