datasketches-cpp
HllUtil.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _HLLUTIL_HPP_
21 #define _HLLUTIL_HPP_
22 
23 #include "MurmurHash3.h"
24 #include "RelativeErrorTables.hpp"
25 #include "count_zeros.hpp"
26 #include "common_defs.hpp"
27 #include "ceiling_power_of_2.hpp"
28 
29 #include <cmath>
30 #include <stdexcept>
31 #include <string>
32 
33 namespace datasketches {
34 
35 enum hll_mode { LIST = 0, SET, HLL };
36 
37 namespace hll_constants {
38 
39 // preamble stuff
40 static const uint8_t SER_VER = 1;
41 static const uint8_t FAMILY_ID = 7;
42 
43 static const uint8_t EMPTY_FLAG_MASK = 4;
44 static const uint8_t COMPACT_FLAG_MASK = 8;
45 static const uint8_t OUT_OF_ORDER_FLAG_MASK = 16;
46 static const uint8_t FULL_SIZE_FLAG_MASK = 32;
47 
48 static const uint32_t PREAMBLE_INTS_BYTE = 0;
49 static const uint32_t SER_VER_BYTE = 1;
50 static const uint32_t FAMILY_BYTE = 2;
51 static const uint32_t LG_K_BYTE = 3;
52 static const uint32_t LG_ARR_BYTE = 4;
53 static const uint32_t FLAGS_BYTE = 5;
54 static const uint32_t LIST_COUNT_BYTE = 6;
55 static const uint32_t HLL_CUR_MIN_BYTE = 6;
56 static const uint32_t MODE_BYTE = 7; // lo2bits = curMode, next 2 bits = tgtHllMode
57 
58 // Coupon List
59 static const uint32_t LIST_INT_ARR_START = 8;
60 static const uint8_t LIST_PREINTS = 2;
61 // Coupon Hash Set
62 static const uint32_t HASH_SET_COUNT_INT = 8;
63 static const uint32_t HASH_SET_INT_ARR_START = 12;
64 static const uint8_t HASH_SET_PREINTS = 3;
65 // HLL
66 static const uint8_t HLL_PREINTS = 10;
67 static const uint32_t HLL_BYTE_ARR_START = 40;
68 static const uint32_t HIP_ACCUM_DOUBLE = 8;
69 static const uint32_t KXQ0_DOUBLE = 16;
70 static const uint32_t KXQ1_DOUBLE = 24;
71 static const uint32_t CUR_MIN_COUNT_INT = 32;
72 static const uint32_t AUX_COUNT_INT = 36;
73 
74 static const uint32_t EMPTY_SKETCH_SIZE_BYTES = 8;
75 
76 // other HllUtil stuff
77 static const uint8_t KEY_BITS_26 = 26;
78 static const uint8_t VAL_BITS_6 = 6;
79 static const uint32_t KEY_MASK_26 = (1 << KEY_BITS_26) - 1;
80 static const uint32_t VAL_MASK_6 = (1 << VAL_BITS_6) - 1;
81 static const uint32_t EMPTY = 0;
82 static const uint8_t MIN_LOG_K = 4;
83 static const uint8_t MAX_LOG_K = 21;
84 
85 static const double HLL_HIP_RSE_FACTOR = 0.8325546; // sqrt(ln(2))
86 static const double HLL_NON_HIP_RSE_FACTOR = 1.03896; // sqrt((3 * ln(2)) - 1)
87 static const double COUPON_RSE_FACTOR = 0.409; // at transition point not the asymptote
88 static const double COUPON_RSE = COUPON_RSE_FACTOR / (1 << 13);
89 
90 static const uint8_t LG_INIT_LIST_SIZE = 3;
91 static const uint8_t LG_INIT_SET_SIZE = 5;
92 static const uint32_t RESIZE_NUMER = 3;
93 static const uint32_t RESIZE_DENOM = 4;
94 
95 static const uint8_t loNibbleMask = 0x0f;
96 static const uint8_t hiNibbleMask = 0xf0;
97 static const uint8_t AUX_TOKEN = 0xf;
98 
103 static const uint8_t LG_AUX_ARR_INTS[] = {
104  0, 2, 2, 2, 2, 2, 2, 3, 3, 3, // 0 - 9
105  4, 4, 5, 5, 6, 7, 8, 9, 10, 11, // 10-19
106  12, 13, 14, 15, 16, 17, 18 // 20-26
107 };
108 
109 } // namespace hll_constants
110 
111 
112 // template provides internal consistency and allows static float values
113 // but we don't use the template parameter anywhere
114 template<typename A = std::allocator<uint8_t> >
115 class HllUtil final {
116 public:
117 
118  static uint32_t coupon(const uint64_t hash[]);
119  static uint32_t coupon(const HashState& hashState);
120  static void hash(const void* key, size_t keyLen, uint64_t seed, HashState& result);
121  static uint8_t checkLgK(uint8_t lgK);
122  static void checkMemSize(uint64_t minBytes, uint64_t capBytes);
123  static inline void checkNumStdDev(uint8_t numStdDev);
124  static uint32_t pair(uint32_t slotNo, uint8_t value);
125  static uint32_t getLow26(uint32_t coupon);
126  static uint8_t getValue(uint32_t coupon);
127  static uint8_t simpleIntLog2(uint32_t n); // n must be power of 2
128  static uint8_t computeLgArrInts(hll_mode mode, uint32_t count, uint8_t lgConfigK);
129  static double getRelErr(bool upperBound, bool unioned, uint8_t lgConfigK, uint8_t numStdDev);
130 };
131 
132 template<typename A>
133 inline uint32_t HllUtil<A>::coupon(const uint64_t hash[]) {
134  uint32_t addr26 = hash[0] & hll_constants::KEY_MASK_26;
135  uint8_t lz = count_leading_zeros_in_u64(hash[1]);
136  uint8_t value = ((lz > 62 ? 62 : lz) + 1);
137  return (value << hll_constants::KEY_BITS_26) | addr26;
138 }
139 
140 template<typename A>
141 inline uint32_t HllUtil<A>::coupon(const HashState& hashState) {
142  uint32_t addr26 = (int) (hashState.h1 & hll_constants::KEY_MASK_26);
143  uint8_t lz = count_leading_zeros_in_u64(hashState.h2);
144  uint8_t value = ((lz > 62 ? 62 : lz) + 1);
145  return (value << hll_constants::KEY_BITS_26) | addr26;
146 }
147 
148 template<typename A>
149 inline void HllUtil<A>::hash(const void* key, size_t keyLen, uint64_t seed, HashState& result) {
150  MurmurHash3_x64_128(key, keyLen, seed, result);
151 }
152 
153 template<typename A>
154 inline uint8_t HllUtil<A>::checkLgK(uint8_t lgK) {
155  if ((lgK >= hll_constants::MIN_LOG_K) && (lgK <= hll_constants::MAX_LOG_K)) {
156  return lgK;
157  } else {
158  throw std::invalid_argument("Invalid value of k: " + std::to_string(lgK));
159  }
160 }
161 
162 template<typename A>
163 inline double HllUtil<A>::getRelErr(bool upperBound, bool unioned,
164  uint8_t lgConfigK, uint8_t numStdDev) {
165  checkLgK(lgConfigK);
166  if (lgConfigK > 12) {
167  const double rseFactor = unioned ?
168  hll_constants::HLL_NON_HIP_RSE_FACTOR : hll_constants::HLL_HIP_RSE_FACTOR;
169  const uint32_t configK = 1 << lgConfigK;
170  return (upperBound ? -1 : 1) * (numStdDev * rseFactor) / sqrt(configK);
171  } else {
172  return RelativeErrorTables<A>::getRelErr(upperBound, unioned, lgConfigK, numStdDev);
173  }
174 }
175 
176 template<typename A>
177 inline void HllUtil<A>::checkMemSize(uint64_t minBytes, uint64_t capBytes) {
178  if (capBytes < minBytes) {
179  throw std::invalid_argument("Given destination array is not large enough: " + std::to_string(capBytes));
180  }
181 }
182 
183 template<typename A>
184 inline void HllUtil<A>::checkNumStdDev(uint8_t numStdDev) {
185  if ((numStdDev < 1) || (numStdDev > 3)) {
186  throw std::invalid_argument("NumStdDev may not be less than 1 or greater than 3.");
187  }
188 }
189 
190 template<typename A>
191 inline uint32_t HllUtil<A>::pair(uint32_t slotNo, uint8_t value) {
192  return (value << hll_constants::KEY_BITS_26) | (slotNo & hll_constants::KEY_MASK_26);
193 }
194 
195 template<typename A>
196 inline uint32_t HllUtil<A>::getLow26(uint32_t coupon) {
197  return coupon & hll_constants::KEY_MASK_26;
198 }
199 
200 template<typename A>
201 inline uint8_t HllUtil<A>::getValue(uint32_t coupon) {
202  return coupon >> hll_constants::KEY_BITS_26;
203 }
204 
205 template<typename A>
206 inline uint8_t HllUtil<A>::simpleIntLog2(uint32_t n) {
207  if (n == 0) {
208  throw std::logic_error("cannot take log of 0");
209  }
210  return count_trailing_zeros_in_u32(n);
211 }
212 
213 template<typename A>
214 inline uint8_t HllUtil<A>::computeLgArrInts(hll_mode mode, uint32_t count, uint8_t lgConfigK) {
215  // assume value missing and recompute
216  if (mode == LIST) { return hll_constants::LG_INIT_LIST_SIZE; }
217  uint32_t ceilPwr2 = ceiling_power_of_2(count);
218  if ((hll_constants::RESIZE_DENOM * count) > (hll_constants::RESIZE_NUMER * ceilPwr2)) { ceilPwr2 <<= 1;}
219  if (mode == SET) {
220  return std::max(hll_constants::LG_INIT_SET_SIZE, HllUtil<A>::simpleIntLog2(ceilPwr2));
221  }
222  //only used for HLL4
223  return std::max(hll_constants::LG_AUX_ARR_INTS[lgConfigK], HllUtil<A>::simpleIntLog2(ceilPwr2));
224 }
225 
226 }
227 
228 #endif /* _HLLUTIL_HPP_ */
DataSketches namespace.
Definition: binomial_bounds.hpp:38