datasketches-cpp
Loading...
Searching...
No Matches
theta_update_sketch_base.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef THETA_UPDATE_SKETCH_BASE_HPP_
21#define THETA_UPDATE_SKETCH_BASE_HPP_
22
23#include <vector>
24#include <climits>
25#include <cmath>
26#include <iterator>
27
28#include "MurmurHash3.h"
29#include "theta_comparators.hpp"
30#include "theta_constants.hpp"
31
32namespace datasketches {
33
34template<
35 typename Entry,
36 typename ExtractKey,
37 typename Allocator
38>
39struct theta_update_sketch_base {
40 using resize_factor = theta_constants::resize_factor;
41 using comparator = compare_by_key<ExtractKey>;
42
43 theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44 uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45 theta_update_sketch_base(const theta_update_sketch_base& other);
46 theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47 ~theta_update_sketch_base();
48 theta_update_sketch_base& operator=(const theta_update_sketch_base& other);
49 theta_update_sketch_base& operator=(theta_update_sketch_base&& other);
50
51 using iterator = Entry*;
52
53 inline uint64_t hash_and_screen(const void* data, size_t length);
54
55 inline std::pair<iterator, bool> find(uint64_t key) const;
56 static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
57
58
59 template<typename FwdEntry>
60 inline void insert(iterator it, FwdEntry&& entry);
61
62 iterator begin() const;
63 iterator end() const;
64
65 // resize threshold = 0.5 tuned for speed
66 static constexpr double RESIZE_THRESHOLD = 0.5;
67 // hash table rebuild threshold = 15/16
68 static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
69
70 static constexpr uint8_t STRIDE_HASH_BITS = 7;
71 static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
72
73 Allocator allocator_;
74 bool is_empty_;
75 uint8_t lg_cur_size_;
76 uint8_t lg_nom_size_;
77 resize_factor rf_;
78 float p_;
79 uint32_t num_entries_;
80 uint64_t theta_;
81 uint64_t seed_;
82 Entry* entries_;
83
84 void resize();
85 void rebuild();
86 void trim();
87 void reset();
88
89 static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
90 static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
91 static void consolidate_non_empty(Entry* entries, size_t size, size_t num);
92};
93
94
96template<typename Derived, typename Allocator>
98public:
103 theta_base_builder(const Allocator& allocator);
104
110 Derived& set_lg_k(uint8_t lg_k);
111
117 Derived& set_resize_factor(resize_factor rf);
118
126 Derived& set_p(float p);
127
135 Derived& set_seed(uint64_t seed);
136
137protected:
138 Allocator allocator_;
139 uint8_t lg_k_;
140 resize_factor rf_;
141 float p_;
142 uint64_t seed_;
143
144 uint64_t starting_theta() const;
145 uint8_t starting_lg_size() const;
146};
147
148// key extractor
149
150struct trivial_extract_key {
151 template<typename T>
152 auto operator()(T&& entry) const -> decltype(std::forward<T>(entry)) {
153 return std::forward<T>(entry);
154 }
155};
156
157// key not zero
158
159template<typename Entry, typename ExtractKey>
160class key_not_zero {
161public:
162 bool operator()(const Entry& entry) const {
163 return ExtractKey()(entry) != 0;
164 }
165};
166
167template<typename Key, typename Entry, typename ExtractKey>
168class key_not_zero_less_than {
169public:
170 explicit key_not_zero_less_than(const Key& key): key(key) {}
171 bool operator()(const Entry& entry) const {
172 return ExtractKey()(entry) != 0 && ExtractKey()(entry) < this->key;
173 }
174private:
175 Key key;
176};
177
178// MurMur3 hash functions
179
180static inline uint64_t compute_hash(const void* data, size_t length, uint64_t seed) {
181 HashState hashes;
182 MurmurHash3_x64_128(data, length, seed, hashes);
183 return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
184}
185
186// iterators
187
188template<typename Entry, typename ExtractKey>
189class theta_iterator {
190public:
191 using iterator_category = std::input_iterator_tag;
192 using value_type = Entry;
193 using difference_type = std::ptrdiff_t;
194 using pointer = Entry*;
195 using reference = Entry&;
196
197 theta_iterator(Entry* entries, uint32_t size, uint32_t index);
198 theta_iterator& operator++();
199 theta_iterator operator++(int);
200 bool operator==(const theta_iterator& other) const;
201 bool operator!=(const theta_iterator& other) const;
202 reference operator*() const;
203 pointer operator->() const;
204
205private:
206 Entry* entries_;
207 uint32_t size_;
208 uint32_t index_;
209};
210
211template<typename Entry, typename ExtractKey>
212class theta_const_iterator {
213public:
214 using iterator_category = std::input_iterator_tag;
215 using value_type = const Entry;
216 using difference_type = std::ptrdiff_t;
217 using pointer = const Entry*;
218 using reference = const Entry&;
219
220 theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index);
221 theta_const_iterator& operator++();
222 theta_const_iterator operator++(int);
223 bool operator==(const theta_const_iterator& other) const;
224 bool operator!=(const theta_const_iterator& other) const;
225 reference operator*() const;
226 pointer operator->() const;
227
228private:
229 const Entry* entries_;
230 uint32_t size_;
231 uint32_t index_;
232};
233
234// double value canonicalization for compatibility with Java
235static inline int64_t canonical_double(double value) {
236 union {
237 int64_t long_value;
238 double double_value;
239 } long_double_union;
240
241 if (value == 0.0) {
242 long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
243 } else if (std::isnan(value)) {
244 long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
245 } else {
246 long_double_union.double_value = value;
247 }
248 return long_double_union.long_value;
249}
250
251} /* namespace datasketches */
252
253#include "theta_update_sketch_base_impl.hpp"
254
255#endif
Theta base builder.
Definition theta_update_sketch_base.hpp:97
Derived & set_lg_k(uint8_t lg_k)
Set log2(k), where k is a nominal number of entries in the sketch.
Definition theta_update_sketch_base_impl.hpp:312
Derived & set_resize_factor(resize_factor rf)
Set resize factor for the internal hash table (defaults to 8)
Definition theta_update_sketch_base_impl.hpp:324
Derived & set_p(float p)
Set sampling probability (initial theta).
Definition theta_update_sketch_base_impl.hpp:330
Derived & set_seed(uint64_t seed)
Set the seed for the hash function.
Definition theta_update_sketch_base_impl.hpp:337
datasketches::resize_factor resize_factor
hash table resize factor
Definition theta_constants.hpp:31
DataSketches namespace.
Definition binomial_bounds.hpp:38