datasketches-cpp
Loading...
Searching...
No Matches
theta_sketch.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef THETA_SKETCH_HPP_
21#define THETA_SKETCH_HPP_
22
23#include "theta_update_sketch_base.hpp"
24#include "compact_theta_sketch_parser.hpp"
25
26namespace datasketches {
27
28// forward declarations
29template<typename A> class theta_sketch_alloc;
30template<typename A> class update_theta_sketch_alloc;
31template<typename A> class compact_theta_sketch_alloc;
32template<typename A> class wrapped_compact_theta_sketch_alloc;
33
42
44template<typename Allocator = std::allocator<uint64_t>>
46public:
47
48 virtual ~base_theta_sketch_alloc() = default;
49
53 virtual Allocator get_allocator() const = 0;
54
58 virtual bool is_empty() const = 0;
59
63 double get_estimate() const;
64
72 double get_lower_bound(uint8_t num_std_devs) const;
73
81 double get_upper_bound(uint8_t num_std_devs) const;
82
86 bool is_estimation_mode() const;
87
91 double get_theta() const;
92
96 virtual uint64_t get_theta64() const = 0;
97
101 virtual uint32_t get_num_retained() const = 0;
102
106 virtual uint16_t get_seed_hash() const = 0;
107
111 virtual bool is_ordered() const = 0;
112
118 virtual string<Allocator> to_string(bool print_items = false) const;
119
120protected:
121 virtual void print_specifics(std::ostringstream& os) const = 0;
122 virtual void print_items(std::ostringstream& os) const = 0;
123};
124
126template<typename Allocator = std::allocator<uint64_t>>
128public:
129 using Entry = uint64_t;
130 using ExtractKey = trivial_extract_key;
131 using iterator = theta_iterator<Entry, ExtractKey>;
132 using const_iterator = theta_const_iterator<Entry, ExtractKey>;
133
134 virtual ~theta_sketch_alloc() = default;
135
140 virtual iterator begin() = 0;
141
147 virtual iterator end() = 0;
148
153 virtual const_iterator begin() const = 0;
154
160 virtual const_iterator end() const = 0;
161
162protected:
163 virtual void print_items(std::ostringstream& os) const;
164};
165
166// forward declaration
167template<typename A> class compact_theta_sketch_alloc;
168
174template<typename Allocator = std::allocator<uint64_t>>
176public:
178 using Entry = typename Base::Entry;
179 using ExtractKey = typename Base::ExtractKey;
180 using iterator = typename Base::iterator;
181 using const_iterator = typename Base::const_iterator;
182 using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
183 using resize_factor = typename theta_table::resize_factor;
184
185 // No constructor here. Use builder instead.
186 class builder;
187
193
199
200 virtual ~update_theta_sketch_alloc() = default;
201
208
215
216 virtual Allocator get_allocator() const;
217 virtual bool is_empty() const;
218 virtual bool is_ordered() const;
219 virtual uint16_t get_seed_hash() const;
220 virtual uint64_t get_theta64() const;
221 virtual uint32_t get_num_retained() const;
222
226 uint8_t get_lg_k() const;
227
231 resize_factor get_rf() const;
232
237 void update(const std::string& value);
238
243 void update(uint64_t value);
244
249 void update(int64_t value);
250
256 void update(uint32_t value);
257
263 void update(int32_t value);
264
270 void update(uint16_t value);
271
277 void update(int16_t value);
278
284 void update(uint8_t value);
285
291 void update(int8_t value);
292
298 void update(double value);
299
305 void update(float value);
306
320 void update(const void* data, size_t length);
321
325 void trim();
326
330 void reset();
331
337 compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
338
339 virtual iterator begin();
340 virtual iterator end();
341 virtual const_iterator begin() const;
342 virtual const_iterator end() const;
343
344private:
345 theta_table table_;
346
347 // for builder
348 update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
349 uint64_t theta, uint64_t seed, const Allocator& allocator);
350
351 virtual void print_specifics(std::ostringstream& os) const;
352};
353
358template<typename Allocator = std::allocator<uint64_t>>
360public:
362 using iterator = typename Base::iterator;
363 using const_iterator = typename Base::const_iterator;
364 using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
365 using vector_bytes = std::vector<uint8_t, AllocBytes>;
366
367 static const uint8_t UNCOMPRESSED_SERIAL_VERSION = 3;
368 static const uint8_t COMPRESSED_SERIAL_VERSION = 4;
369 static const uint8_t SKETCH_TYPE = 3;
370
371 // Instances of this type can be obtained:
372 // - by compacting an update_theta_sketch_alloc
373 // - as a result of a set operation
374 // - by deserializing a previously serialized compact sketch
375
382 template<typename Other>
383 compact_theta_sketch_alloc(const Other& other, bool ordered);
384
390
396
397 virtual ~compact_theta_sketch_alloc() = default;
398
405
412
413 virtual Allocator get_allocator() const;
414 virtual bool is_empty() const;
415 virtual bool is_ordered() const;
416 virtual uint64_t get_theta64() const;
417 virtual uint32_t get_num_retained() const;
418 virtual uint16_t get_seed_hash() const;
419
424 static size_t get_max_serialized_size_bytes(uint8_t lg_k);
425
432 size_t get_serialized_size_bytes(bool compressed = false) const;
433
438 void serialize(std::ostream& os) const;
439
447 vector_bytes serialize(unsigned header_size_bytes = 0) const;
448
455 void serialize_compressed(std::ostream& os) const;
456
466 vector_bytes serialize_compressed(unsigned header_size_bytes = 0) const;
467
468 virtual iterator begin();
469 virtual iterator end();
470 virtual const_iterator begin() const;
471 virtual const_iterator end() const;
472
481 uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
482
491 static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
492 uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
493
494private:
495 enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
496
497 bool is_empty_;
498 bool is_ordered_;
499 uint16_t seed_hash_;
500 uint64_t theta_;
501 std::vector<uint64_t, Allocator> entries_;
502
503 uint8_t get_preamble_longs(bool compressed) const;
504 bool is_suitable_for_compression() const;
505 uint8_t compute_entry_bits() const;
506 uint8_t get_num_entries_bytes() const;
507 size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
508 void serialize_version_4(std::ostream& os) const;
509 vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
510
511 static compact_theta_sketch_alloc deserialize_v1(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
512 static compact_theta_sketch_alloc deserialize_v2(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
513 static compact_theta_sketch_alloc deserialize_v3(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
514 static compact_theta_sketch_alloc deserialize_v4(uint8_t preamble_longs, std::istream& is, uint64_t seed, const Allocator& allocator);
515
516 virtual void print_specifics(std::ostringstream& os) const;
517
518 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_union_base;
519 template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
520 template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
521 compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
522};
523
525template<typename Allocator>
526class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
527public:
532 builder(const Allocator& allocator = Allocator());
534 update_theta_sketch_alloc build() const;
535};
536
542template<typename Allocator = std::allocator<uint64_t>>
544public:
545 class const_iterator;
546
547 Allocator get_allocator() const;
548 bool is_empty() const;
549 bool is_ordered() const;
550 uint64_t get_theta64() const;
551 uint32_t get_num_retained() const;
552 uint16_t get_seed_hash() const;
553
558 const_iterator begin() const;
559
565 const_iterator end() const;
566
575 static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
576
577protected:
578 virtual void print_specifics(std::ostringstream& os) const;
579 virtual void print_items(std::ostringstream& os) const;
580
581private:
582 using data_type = compact_theta_sketch_parser<true>::compact_theta_sketch_data;
583 data_type data_;
584
585 wrapped_compact_theta_sketch_alloc(const data_type& data);
586};
587
588template<typename Allocator>
589class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
590public:
591 using iterator_category = std::input_iterator_tag;
592 using value_type = const uint64_t;
593 using difference_type = void;
594 using pointer = value_type*;
595 using reference = uint64_t;
596
597 const_iterator(const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index);
598 const_iterator& operator++();
599 const_iterator operator++(int);
600 bool operator==(const const_iterator& other) const;
601 bool operator!=(const const_iterator& other) const;
602 reference operator*() const;
603 pointer operator->() const;
604
605private:
606 const void* ptr_;
607 uint8_t entry_bits_;
608 uint32_t num_entries_;
609 uint32_t index_;
610 uint64_t previous_;
611 bool is_block_mode_;
612 uint8_t buf_i_;
613 uint8_t offset_;
614 uint64_t buffer_[8];
615};
616
617} /* namespace datasketches */
618
619#include "theta_sketch_impl.hpp"
620
621#endif
Abstract base class for Theta sketch.
Definition theta_sketch.hpp:45
double get_estimate() const
Definition theta_sketch_impl.hpp:47
double get_lower_bound(uint8_t num_std_devs) const
Returns the approximate lower error bound given a number of standard deviations.
Definition theta_sketch_impl.hpp:52
virtual bool is_ordered() const =0
virtual bool is_empty() const =0
virtual string< Allocator > to_string(bool print_items=false) const
Provides a human-readable summary of this sketch as a string.
Definition theta_sketch_impl.hpp:64
virtual uint32_t get_num_retained() const =0
double get_upper_bound(uint8_t num_std_devs) const
Returns the approximate upper error bound given a number of standard deviations.
Definition theta_sketch_impl.hpp:58
double get_theta() const
Definition theta_sketch_impl.hpp:41
virtual uint16_t get_seed_hash() const =0
bool is_estimation_mode() const
Definition theta_sketch_impl.hpp:36
virtual Allocator get_allocator() const =0
virtual uint64_t get_theta64() const =0
Compact Theta sketch.
Definition theta_sketch.hpp:359
compact_theta_sketch_alloc & operator=(compact_theta_sketch_alloc &&other)=default
Move assignment.
compact_theta_sketch_alloc(const compact_theta_sketch_alloc &other)=default
Copy constructor.
compact_theta_sketch_alloc & operator=(const compact_theta_sketch_alloc &other)=default
Copy assignment.
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition theta_sketch_impl.hpp:378
virtual uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:307
static compact_theta_sketch_alloc deserialize(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
virtual uint32_t get_num_retained() const
Definition theta_sketch_impl.hpp:312
virtual bool is_empty() const
Definition theta_sketch_impl.hpp:297
virtual bool is_ordered() const
Definition theta_sketch_impl.hpp:302
virtual uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:317
virtual iterator end()
Iterator pointing past the valid range.
Definition theta_sketch_impl.hpp:327
static size_t get_max_serialized_size_bytes(uint8_t lg_k)
Computes maximum serialized size in bytes.
Definition theta_sketch_impl.hpp:353
static compact_theta_sketch_alloc deserialize(std::istream &is, uint64_t seed=DEFAULT_SEED, const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
virtual Allocator get_allocator() const
Definition theta_sketch_impl.hpp:292
size_t get_serialized_size_bytes(bool compressed=false) const
Computes size in bytes required to serialize the current state of the sketch.
Definition theta_sketch_impl.hpp:358
compact_theta_sketch_alloc(compact_theta_sketch_alloc &&other) noexcept=default
Move constructor.
void serialize_compressed(std::ostream &os) const
This method serializes the sketch into a given stream in a compressed binary form.
Definition theta_sketch_impl.hpp:435
virtual iterator begin()
Iterator over hash values in this sketch.
Definition theta_sketch_impl.hpp:322
Theta base builder.
Definition theta_update_sketch_base.hpp:97
virtual const_iterator begin() const =0
Const iterator over hash values in this sketch.
virtual const_iterator end() const =0
Const iterator pointing past the valid range.
virtual iterator end()=0
Iterator pointing past the valid range.
virtual iterator begin()=0
Iterator over hash values in this sketch.
Update Theta sketch builder.
Definition theta_sketch.hpp:526
Update Theta sketch.
Definition theta_sketch.hpp:175
void trim()
Remove retained entries in excess of the nominal size k (if any)
Definition theta_sketch_impl.hpp:212
virtual uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:121
virtual uint32_t get_num_retained() const
Definition theta_sketch_impl.hpp:126
resize_factor get_rf() const
Definition theta_sketch_impl.hpp:141
void update(const std::string &value)
Update this sketch with a given string.
Definition theta_sketch_impl.hpp:196
virtual bool is_empty() const
Definition theta_sketch_impl.hpp:111
virtual bool is_ordered() const
Definition theta_sketch_impl.hpp:116
virtual uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:131
update_theta_sketch_alloc(update_theta_sketch_alloc &&other) noexcept=default
Move constructor.
virtual iterator end()
Iterator pointing past the valid range.
Definition theta_sketch_impl.hpp:227
uint8_t get_lg_k() const
Definition theta_sketch_impl.hpp:136
virtual Allocator get_allocator() const
Definition theta_sketch_impl.hpp:106
update_theta_sketch_alloc & operator=(const update_theta_sketch_alloc &other)=default
Copy assignment.
update_theta_sketch_alloc & operator=(update_theta_sketch_alloc &&other)=default
Move assignment.
void reset()
Reset the sketch to the initial empty state.
Definition theta_sketch_impl.hpp:217
virtual iterator begin()
Iterator over hash values in this sketch.
Definition theta_sketch_impl.hpp:222
compact_theta_sketch_alloc< Allocator > compact(bool ordered=true) const
Converts this sketch to a compact sketch (ordered or unordered).
Definition theta_sketch_impl.hpp:242
update_theta_sketch_alloc(const update_theta_sketch_alloc &other)=default
Copy constructor.
Wrapped Compact Theta sketch.
Definition theta_sketch.hpp:543
uint64_t get_theta64() const
Definition theta_sketch_impl.hpp:774
uint32_t get_num_retained() const
Definition theta_sketch_impl.hpp:779
bool is_empty() const
Definition theta_sketch_impl.hpp:764
bool is_ordered() const
Definition theta_sketch_impl.hpp:769
uint16_t get_seed_hash() const
Definition theta_sketch_impl.hpp:784
static const wrapped_compact_theta_sketch_alloc wrap(const void *bytes, size_t size, uint64_t seed=DEFAULT_SEED, bool dump_on_error=false)
This method wraps a serialized compact sketch as an array of bytes.
Definition theta_sketch_impl.hpp:754
Allocator get_allocator() const
Definition theta_sketch_impl.hpp:759
const_iterator begin() const
Const iterator over hash values in this sketch.
Definition theta_sketch_impl.hpp:789
const_iterator end() const
Const iterator pointing past the valid range.
Definition theta_sketch_impl.hpp:794
DataSketches namespace.
Definition binomial_bounds.hpp:38