datasketches-cpp
theta_update_sketch_base.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_UPDATE_SKETCH_BASE_HPP_
21 #define THETA_UPDATE_SKETCH_BASE_HPP_
22 
23 #include <vector>
24 #include <climits>
25 #include <cmath>
26 #include <iterator>
27 
28 #include "MurmurHash3.h"
29 #include "theta_comparators.hpp"
30 #include "theta_constants.hpp"
31 
32 namespace datasketches {
33 
34 template<
35  typename Entry,
36  typename ExtractKey,
37  typename Allocator
38 >
39 struct theta_update_sketch_base {
40  using resize_factor = theta_constants::resize_factor;
41  using comparator = compare_by_key<ExtractKey>;
42 
43  theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44  uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45  theta_update_sketch_base(const theta_update_sketch_base& other);
46  theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47  ~theta_update_sketch_base();
48  theta_update_sketch_base& operator=(const theta_update_sketch_base& other);
49  theta_update_sketch_base& operator=(theta_update_sketch_base&& other);
50 
51  using iterator = Entry*;
52 
53  inline uint64_t hash_and_screen(const void* data, size_t length);
54 
55  inline std::pair<iterator, bool> find(uint64_t key) const;
56  static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
57 
58 
59  template<typename FwdEntry>
60  inline void insert(iterator it, FwdEntry&& entry);
61 
62  iterator begin() const;
63  iterator end() const;
64 
65  // resize threshold = 0.5 tuned for speed
66  static constexpr double RESIZE_THRESHOLD = 0.5;
67  // hash table rebuild threshold = 15/16
68  static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
69 
70  static constexpr uint8_t STRIDE_HASH_BITS = 7;
71  static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
72 
73  Allocator allocator_;
74  bool is_empty_;
75  uint8_t lg_cur_size_;
76  uint8_t lg_nom_size_;
77  resize_factor rf_;
78  float p_;
79  uint32_t num_entries_;
80  uint64_t theta_;
81  uint64_t seed_;
82  Entry* entries_;
83 
84  void resize();
85  void rebuild();
86  void trim();
87  void reset();
88 
89  static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
90  static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
91  static void consolidate_non_empty(Entry* entries, size_t size, size_t num);
92 };
93 
94 
96 template<typename Derived, typename Allocator>
98 public:
103  theta_base_builder(const Allocator& allocator);
104 
110  Derived& set_lg_k(uint8_t lg_k);
111 
117  Derived& set_resize_factor(resize_factor rf);
118 
126  Derived& set_p(float p);
127 
135  Derived& set_seed(uint64_t seed);
136 
137 protected:
138  Allocator allocator_;
139  uint8_t lg_k_;
140  resize_factor rf_;
141  float p_;
142  uint64_t seed_;
143 
144  uint64_t starting_theta() const;
145  uint8_t starting_lg_size() const;
146 };
147 
148 // key extractor
149 
150 struct trivial_extract_key {
151  template<typename T>
152  auto operator()(T&& entry) const -> decltype(std::forward<T>(entry)) {
153  return std::forward<T>(entry);
154  }
155 };
156 
157 // key not zero
158 
159 template<typename Entry, typename ExtractKey>
160 class key_not_zero {
161 public:
162  bool operator()(const Entry& entry) const {
163  return ExtractKey()(entry) != 0;
164  }
165 };
166 
167 template<typename Key, typename Entry, typename ExtractKey>
168 class key_not_zero_less_than {
169 public:
170  explicit key_not_zero_less_than(const Key& key): key(key) {}
171  bool operator()(const Entry& entry) const {
172  return ExtractKey()(entry) != 0 && ExtractKey()(entry) < this->key;
173  }
174 private:
175  Key key;
176 };
177 
178 // MurMur3 hash functions
179 
180 static inline uint64_t compute_hash(const void* data, size_t length, uint64_t seed) {
181  HashState hashes;
182  MurmurHash3_x64_128(data, length, seed, hashes);
183  return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
184 }
185 
186 // iterators
187 
188 template<typename Entry, typename ExtractKey>
189 class theta_iterator {
190 public:
191  using iterator_category = std::input_iterator_tag;
192  using value_type = Entry;
193  using difference_type = std::ptrdiff_t;
194  using pointer = Entry*;
195  using reference = Entry&;
196 
197  theta_iterator(Entry* entries, uint32_t size, uint32_t index);
198  theta_iterator& operator++();
199  theta_iterator operator++(int);
200  bool operator==(const theta_iterator& other) const;
201  bool operator!=(const theta_iterator& other) const;
202  reference operator*() const;
203  pointer operator->() const;
204 
205 private:
206  Entry* entries_;
207  uint32_t size_;
208  uint32_t index_;
209 };
210 
211 template<typename Entry, typename ExtractKey>
212 class theta_const_iterator {
213 public:
214  using iterator_category = std::input_iterator_tag;
215  using value_type = const Entry;
216  using difference_type = std::ptrdiff_t;
217  using pointer = const Entry*;
218  using reference = const Entry&;
219 
220  theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index);
221  theta_const_iterator& operator++();
222  theta_const_iterator operator++(int);
223  bool operator==(const theta_const_iterator& other) const;
224  bool operator!=(const theta_const_iterator& other) const;
225  reference operator*() const;
226  pointer operator->() const;
227 
228 private:
229  const Entry* entries_;
230  uint32_t size_;
231  uint32_t index_;
232 };
233 
234 // double value canonicalization for compatibility with Java
235 static inline int64_t canonical_double(double value) {
236  union {
237  int64_t long_value;
238  double double_value;
239  } long_double_union;
240 
241  if (value == 0.0) {
242  long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
243  } else if (std::isnan(value)) {
244  long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
245  } else {
246  long_double_union.double_value = value;
247  }
248  return long_double_union.long_value;
249 }
250 
251 } /* namespace datasketches */
252 
253 #include "theta_update_sketch_base_impl.hpp"
254 
255 #endif
Theta base builder.
Definition: theta_update_sketch_base.hpp:97
Derived & set_lg_k(uint8_t lg_k)
Set log2(k), where k is a nominal number of entries in the sketch.
Definition: theta_update_sketch_base_impl.hpp:312
theta_base_builder(const Allocator &allocator)
Creates and instance of the builder with default parameters.
Definition: theta_update_sketch_base_impl.hpp:304
Derived & set_resize_factor(resize_factor rf)
Set resize factor for the internal hash table (defaults to 8)
Definition: theta_update_sketch_base_impl.hpp:324
Derived & set_p(float p)
Set sampling probability (initial theta).
Definition: theta_update_sketch_base_impl.hpp:330
Derived & set_seed(uint64_t seed)
Set the seed for the hash function.
Definition: theta_update_sketch_base_impl.hpp:337
datasketches::resize_factor resize_factor
hash table resize factor
Definition: theta_constants.hpp:31
DataSketches namespace.
Definition: binomial_bounds.hpp:38