datasketches-cpp
Loading...
Searching...
No Matches
HllArray-internal.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _HLLARRAY_INTERNAL_HPP_
21#define _HLLARRAY_INTERNAL_HPP_
22
23#include "HllArray.hpp"
24#include "HllUtil.hpp"
25#include "HarmonicNumbers.hpp"
26#include "CubicInterpolation.hpp"
27#include "CompositeInterpolationXTable.hpp"
28#include "CouponList.hpp"
29#include "inv_pow2_table.hpp"
30#include <cstring>
31#include <cmath>
32#include <stdexcept>
33#include <string>
34
35namespace datasketches {
36
37template<typename A>
38HllArray<A>::HllArray(uint8_t lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator):
39HllSketchImpl<A>(lgConfigK, tgtHllType, hll_mode::HLL, startFullSize),
40hipAccum_(0.0),
41kxq0_(1 << lgConfigK),
42kxq1_(0.0),
43hllByteArr_(allocator),
44curMin_(0),
45numAtCurMin_(1 << lgConfigK),
46oooFlag_(false),
47rebuild_kxq_curmin_(false)
48{}
49
50template<typename A>
51HllArray<A>::HllArray(const HllArray& other, target_hll_type tgtHllType) :
52 HllSketchImpl<A>(other.getLgConfigK(), tgtHllType, hll_mode::HLL, other.isStartFullSize()),
53 // remaining fields are initialized to empty sketch defaults
54 // and left to subclass constructor to populate
55 hipAccum_(0.0),
56 kxq0_(1 << other.getLgConfigK()),
57 kxq1_(0.0),
58 hllByteArr_(other.getAllocator()),
59 curMin_(0),
60 numAtCurMin_(1 << other.getLgConfigK()),
61 oooFlag_(false),
62 rebuild_kxq_curmin_(false)
63{}
64
65template<typename A>
66HllArray<A>* HllArray<A>::copyAs(target_hll_type tgtHllType) const {
67 // we may need to recompute KxQ and curMin data for a union gadget,
68 // so only use a direct copy if we have a valid sketch
69 if (tgtHllType == this->getTgtHllType() && !this->isRebuildKxqCurminFlag()) {
70 return static_cast<HllArray*>(copy());
71 }
72
73 // the factory methods replay the coupons and will always rebuild
74 // the sketch in a consistent way
75 switch (tgtHllType) {
76 case target_hll_type::HLL_4:
77 return HllSketchImplFactory<A>::convertToHll4(*this);
78 case target_hll_type::HLL_6:
79 return HllSketchImplFactory<A>::convertToHll6(*this);
80 case target_hll_type::HLL_8:
81 return HllSketchImplFactory<A>::convertToHll8(*this);
82 default:
83 throw std::invalid_argument("Invalid target HLL type");
84 }
85}
86
87template<typename A>
88HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len, const A& allocator) {
89 if (len < hll_constants::HLL_BYTE_ARR_START) {
90 throw std::out_of_range("Input data length insufficient to hold HLL array");
91 }
92
93 const uint8_t* data = static_cast<const uint8_t*>(bytes);
94 if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HLL_PREINTS) {
95 throw std::invalid_argument("Incorrect number of preInts in input stream");
96 }
97 if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
98 throw std::invalid_argument("Wrong ser ver in input stream");
99 }
100 if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
101 throw std::invalid_argument("Input array is not an HLL sketch");
102 }
103
104 const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
105 if (mode != HLL) {
106 throw std::invalid_argument("Calling HLL array constructor with non-HLL mode data");
107 }
108
109 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
110 const bool oooFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
111 const bool comapctFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
112 const bool startFullSizeFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::FULL_SIZE_FLAG_MASK) ? true : false);
113
114 const uint8_t lgK = data[hll_constants::LG_K_BYTE];
115 const uint8_t curMin = data[hll_constants::HLL_CUR_MIN_BYTE];
116
117 const uint32_t arrayBytes = hllArrBytes(tgtHllType, lgK);
118 if (len < static_cast<size_t>(hll_constants::HLL_BYTE_ARR_START + arrayBytes)) {
119 throw std::out_of_range("Input array too small to hold sketch image");
120 }
121
122 double hip, kxq0, kxq1;
123 std::memcpy(&hip, data + hll_constants::HIP_ACCUM_DOUBLE, sizeof(double));
124 std::memcpy(&kxq0, data + hll_constants::KXQ0_DOUBLE, sizeof(double));
125 std::memcpy(&kxq1, data + hll_constants::KXQ1_DOUBLE, sizeof(double));
126
127 uint32_t numAtCurMin, auxCount;
128 std::memcpy(&numAtCurMin, data + hll_constants::CUR_MIN_COUNT_INT, sizeof(int));
129 std::memcpy(&auxCount, data + hll_constants::AUX_COUNT_INT, sizeof(int));
130
131 AuxHashMap<A>* auxHashMap = nullptr;
132 typedef std::unique_ptr<AuxHashMap<A>, std::function<void(AuxHashMap<A>*)>> aux_hash_map_ptr;
133 aux_hash_map_ptr aux_ptr;
134 if (auxCount > 0) { // necessarily TgtHllType == HLL_4
135 uint8_t auxLgIntArrSize = data[4];
136 const size_t offset = hll_constants::HLL_BYTE_ARR_START + arrayBytes;
137 const uint8_t* auxDataStart = data + offset;
138 auxHashMap = AuxHashMap<A>::deserialize(auxDataStart, len - offset, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
139 aux_ptr = aux_hash_map_ptr(auxHashMap, auxHashMap->make_deleter());
140 }
141
142 HllArray<A>* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
143 sketch->putCurMin(curMin);
144 sketch->putOutOfOrderFlag(oooFlag);
145 if (!oooFlag) sketch->putHipAccum(hip);
146 sketch->putKxQ0(kxq0);
147 sketch->putKxQ1(kxq1);
148 sketch->putNumAtCurMin(numAtCurMin);
149
150 std::memcpy(sketch->hllByteArr_.data(), data + hll_constants::HLL_BYTE_ARR_START, arrayBytes);
151
152 if (auxHashMap != nullptr)
153 ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
154
155 aux_ptr.release();
156 return sketch;
157}
158
159template<typename A>
160HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
161 uint8_t listHeader[8];
162 read(is, listHeader, 8 * sizeof(uint8_t));
163
164 if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HLL_PREINTS) {
165 throw std::invalid_argument("Incorrect number of preInts in input stream");
166 }
167 if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
168 throw std::invalid_argument("Wrong ser ver in input stream");
169 }
170 if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
171 throw std::invalid_argument("Input stream is not an HLL sketch");
172 }
173
174 hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
175 if (mode != HLL) {
176 throw std::invalid_argument("Calling HLL construtor with non-HLL mode data");
177 }
178
179 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
180 const bool oooFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
181 const bool comapctFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
182 const bool startFullSizeFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::FULL_SIZE_FLAG_MASK) ? true : false);
183
184 const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
185 const uint8_t curMin = listHeader[hll_constants::HLL_CUR_MIN_BYTE];
186
187 HllArray* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
188 typedef std::unique_ptr<HllArray<A>, std::function<void(HllSketchImpl<A>*)>> hll_array_ptr;
189 hll_array_ptr sketch_ptr(sketch, sketch->get_deleter());
190 sketch->putCurMin(curMin);
191 sketch->putOutOfOrderFlag(oooFlag);
192
193 const auto hip = read<double>(is);
194 const auto kxq0 = read<double>(is);
195 const auto kxq1 = read<double>(is);
196 if (!oooFlag) sketch->putHipAccum(hip);
197 sketch->putKxQ0(kxq0);
198 sketch->putKxQ1(kxq1);
199
200 const auto numAtCurMin = read<uint32_t>(is);
201 const auto auxCount = read<uint32_t>(is);
202 sketch->putNumAtCurMin(numAtCurMin);
203
204 read(is, sketch->hllByteArr_.data(), sketch->getHllByteArrBytes());
205
206 if (auxCount > 0) { // necessarily TgtHllType == HLL_4
207 uint8_t auxLgIntArrSize = listHeader[4];
208 AuxHashMap<A>* auxHashMap = AuxHashMap<A>::deserialize(is, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
209 ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
210 }
211
212 if (!is.good())
213 throw std::runtime_error("error reading from std::istream");
214
215 return sketch_ptr.release();
216}
217
218template<typename A>
219auto HllArray<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
220 const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
221 vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
222 uint8_t* bytes = byteArr.data() + header_size_bytes;
223 AuxHashMap<A>* auxHashMap = getAuxHashMap();
224
225 bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
226 bytes[hll_constants::SER_VER_BYTE] = static_cast<uint8_t>(hll_constants::SER_VER);
227 bytes[hll_constants::FAMILY_BYTE] = static_cast<uint8_t>(hll_constants::FAMILY_ID);
228 bytes[hll_constants::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK_);
229 bytes[hll_constants::LG_ARR_BYTE] = static_cast<uint8_t>(auxHashMap == nullptr ? 0 : auxHashMap->getLgAuxArrInts());
230 bytes[hll_constants::FLAGS_BYTE] = this->makeFlagsByte(compact);
231 bytes[hll_constants::HLL_CUR_MIN_BYTE] = static_cast<uint8_t>(curMin_);
232 bytes[hll_constants::MODE_BYTE] = this->makeModeByte();
233
234 std::memcpy(bytes + hll_constants::HIP_ACCUM_DOUBLE, &hipAccum_, sizeof(double));
235 std::memcpy(bytes + hll_constants::KXQ0_DOUBLE, &kxq0_, sizeof(double));
236 std::memcpy(bytes + hll_constants::KXQ1_DOUBLE, &kxq1_, sizeof(double));
237 std::memcpy(bytes + hll_constants::CUR_MIN_COUNT_INT, &numAtCurMin_, sizeof(uint32_t));
238 const uint32_t auxCount = (auxHashMap == nullptr ? 0 : auxHashMap->getAuxCount());
239 std::memcpy(bytes + hll_constants::AUX_COUNT_INT, &auxCount, sizeof(uint32_t));
240
241 const uint32_t hllByteArrBytes = getHllByteArrBytes();
242 std::memcpy(bytes + getMemDataStart(), hllByteArr_.data(), hllByteArrBytes);
243
244 // aux map if HLL_4
245 if (this->tgtHllType_ == HLL_4) {
246 bytes += getMemDataStart() + hllByteArrBytes; // start of auxHashMap
247 if (auxHashMap != nullptr) {
248 if (compact) {
249 for (const uint32_t coupon: *auxHashMap) {
250 std::memcpy(bytes, &coupon, sizeof(coupon));
251 bytes += sizeof(coupon);
252 }
253 } else {
254 std::memcpy(bytes, auxHashMap->getAuxIntArr(), auxHashMap->getUpdatableSizeBytes());
255 }
256 } else if (!compact) {
257 // if updatable, we write even if currently unused so the binary can be wrapped
258 uint32_t auxBytes = 4 << hll_constants::LG_AUX_ARR_INTS[this->lgConfigK_];
259 std::fill_n(bytes, auxBytes, static_cast<uint8_t>(0));
260 }
261 }
262
263 return byteArr;
264}
265
266template<typename A>
267void HllArray<A>::serialize(std::ostream& os, bool compact) const {
268 // header
269 const uint8_t preInts = getPreInts();
270 write(os, preInts);
271 const uint8_t serialVersion = hll_constants::SER_VER;
272 write(os, serialVersion);
273 const uint8_t familyId = hll_constants::FAMILY_ID;
274 write(os, familyId);
275 const uint8_t lgKByte = this->lgConfigK_;
276 write(os, lgKByte);
277
278 AuxHashMap<A>* auxHashMap = getAuxHashMap();
279 uint8_t lgArrByte = 0;
280 if (auxHashMap != nullptr) {
281 lgArrByte = auxHashMap->getLgAuxArrInts();
282 }
283 write(os, lgArrByte);
284
285 const uint8_t flagsByte = this->makeFlagsByte(compact);
286 write(os, flagsByte);
287 write(os, curMin_);
288 const uint8_t modeByte = this->makeModeByte();
289 write(os, modeByte);
290
291 // estimator data
292 write(os, hipAccum_);
293 write(os, kxq0_);
294 write(os, kxq1_);
295
296 // array data
297 write(os, numAtCurMin_);
298
299 const uint32_t auxCount = (auxHashMap == nullptr ? 0 : auxHashMap->getAuxCount());
300 write(os, auxCount);
301 write(os, hllByteArr_.data(), getHllByteArrBytes());
302
303 // aux map if HLL_4
304 if (this->tgtHllType_ == HLL_4) {
305 if (auxHashMap != nullptr) {
306 if (compact) {
307 for (const uint32_t coupon: *auxHashMap) {
308 write(os, coupon);
309 }
310 } else {
311 write(os, auxHashMap->getAuxIntArr(), auxHashMap->getUpdatableSizeBytes());
312 }
313 } else if (!compact) {
314 // if updatable, we write even if currently unused so the binary can be wrapped
315 uint32_t auxBytes = 4 << hll_constants::LG_AUX_ARR_INTS[this->lgConfigK_];
316 std::fill_n(std::ostreambuf_iterator<char>(os), auxBytes, static_cast<char>(0));
317 }
318 }
319}
320
321template<typename A>
322double HllArray<A>::getEstimate() const {
323 if (oooFlag_) {
324 return getCompositeEstimate();
325 }
326 return hipAccum_;
327}
328
329// HLL UPPER AND LOWER BOUNDS
330
331/*
332 * The upper and lower bounds are not symmetric and thus are treated slightly differently.
333 * For the lower bound, when the unique count is <= k, LB >= numNonZeros, where
334 * numNonZeros = k - numAtCurMin AND curMin == 0.
335 *
336 * For HLL6 and HLL8, curMin is always 0 and numAtCurMin is initialized to k and is decremented
337 * down for each valid update until it reaches 0, where it stays. Thus, for these two
338 * isomorphs, when numAtCurMin = 0, means the true curMin is > 0 and the unique count must be
339 * greater than k.
340 *
341 * HLL4 always maintains both curMin and numAtCurMin dynamically. Nonetheless, the rules for
342 * the very small values <= k where curMin = 0 still apply.
343 */
344template<typename A>
345double HllArray<A>::getLowerBound(uint8_t numStdDev) const {
346 HllUtil<A>::checkNumStdDev(numStdDev);
347 const uint32_t configK = 1 << this->lgConfigK_;
348 const double numNonZeros = ((curMin_ == 0) ? (configK - numAtCurMin_) : configK);
349 const double relErr = HllUtil<A>::getRelErr(false, this->oooFlag_, this->lgConfigK_, numStdDev);
350 return fmax(getEstimate() / (1.0 + relErr), numNonZeros);
351}
352
353template<typename A>
354double HllArray<A>::getUpperBound(uint8_t numStdDev) const {
355 HllUtil<A>::checkNumStdDev(numStdDev);
356 const double relErr = HllUtil<A>::getRelErr(true, this->oooFlag_, this->lgConfigK_, numStdDev);
357 return getEstimate() / (1.0 + relErr);
358}
359
365// Original C: again-two-registers.c hhb_get_composite_estimate L1489
366template<typename A>
367double HllArray<A>::getCompositeEstimate() const {
368 const double rawEst = getHllRawEstimate();
369
370 const double* xArr = CompositeInterpolationXTable<A>::get_x_arr(this->lgConfigK_);
371 const uint32_t xArrLen = CompositeInterpolationXTable<A>::get_x_arr_length();
372 const double yStride = CompositeInterpolationXTable<A>::get_y_stride(this->lgConfigK_);
373
374 if (rawEst < xArr[0]) {
375 return 0;
376 }
377
378 const uint32_t xArrLenM1 = xArrLen - 1;
379
380 if (rawEst > xArr[xArrLenM1]) {
381 const double finalY = yStride * xArrLenM1;
382 const double factor = finalY / xArr[xArrLenM1];
383 return rawEst * factor;
384 }
385
386 double adjEst = CubicInterpolation<A>::usingXArrAndYStride(xArr, xArrLen, yStride, rawEst);
387
388 // We need to completely avoid the linear_counting estimator if it might have a crazy value.
389 // Empirical evidence suggests that the threshold 3*k will keep us safe if 2^4 <= k <= 2^21.
390
391 if (adjEst > (3 << this->lgConfigK_)) { return adjEst; }
392
393 const double linEst = getHllBitMapEstimate();
394
395 // Bias is created when the value of an estimator is compared with a threshold to decide whether
396 // to use that estimator or a different one.
397 // We conjecture that less bias is created when the average of the two estimators
398 // is compared with the threshold. Empirical measurements support this conjecture.
399
400 const double avgEst = (adjEst + linEst) / 2.0;
401
402 // The following constants comes from empirical measurements of the crossover point
403 // between the average error of the linear estimator and the adjusted hll estimator
404 double crossOver = 0.64;
405 if (this->lgConfigK_ == 4) { crossOver = 0.718; }
406 else if (this->lgConfigK_ == 5) { crossOver = 0.672; }
407
408 return (avgEst > (crossOver * (1 << this->lgConfigK_))) ? adjEst : linEst;
409}
410
411template<typename A>
412double HllArray<A>::getKxQ0() const {
413 return kxq0_;
414}
415
416template<typename A>
417double HllArray<A>::getKxQ1() const {
418 return kxq1_;
419}
420
421template<typename A>
422double HllArray<A>::getHipAccum() const {
423 return hipAccum_;
424}
425
426template<typename A>
427uint8_t HllArray<A>::getCurMin() const {
428 return curMin_;
429}
430
431template<typename A>
432uint32_t HllArray<A>::getNumAtCurMin() const {
433 return numAtCurMin_;
434}
435
436template<typename A>
437void HllArray<A>::putKxQ0(double kxq0) {
438 kxq0_ = kxq0;
439}
440
441template<typename A>
442void HllArray<A>::putKxQ1(double kxq1) {
443 kxq1_ = kxq1;
444}
445
446template<typename A>
447void HllArray<A>::putHipAccum(double hipAccum) {
448 hipAccum_ = hipAccum;
449}
450
451template<typename A>
452void HllArray<A>::putCurMin(uint8_t curMin) {
453 curMin_ = curMin;
454}
455
456template<typename A>
457void HllArray<A>::putNumAtCurMin(uint32_t numAtCurMin) {
458 numAtCurMin_ = numAtCurMin;
459}
460
461template<typename A>
462bool HllArray<A>::isCompact() const {
463 return false;
464}
465
466template<typename A>
467bool HllArray<A>::isEmpty() const {
468 const uint32_t configK = 1 << this->lgConfigK_;
469 return (curMin_ == 0) && (numAtCurMin_ == configK);
470}
471
472template<typename A>
473void HllArray<A>::putOutOfOrderFlag(bool flag) {
474 oooFlag_ = flag;
475}
476
477template<typename A>
478bool HllArray<A>::isOutOfOrderFlag() const {
479 return oooFlag_;
480}
481
482template<typename A>
483uint32_t HllArray<A>::hllArrBytes(target_hll_type tgtHllType, uint8_t lgConfigK) {
484 switch (tgtHllType) {
485 case HLL_4:
486 return hll4ArrBytes(lgConfigK);
487 case HLL_6:
488 return hll6ArrBytes(lgConfigK);
489 case HLL_8:
490 return hll8ArrBytes(lgConfigK);
491 default:
492 throw std::invalid_argument("Invalid target HLL type");
493 }
494}
495
496template<typename A>
497uint32_t HllArray<A>::hll4ArrBytes(uint8_t lgConfigK) {
498 return 1 << (lgConfigK - 1);
499}
500
501template<typename A>
502uint32_t HllArray<A>::hll6ArrBytes(uint8_t lgConfigK) {
503 const uint32_t numSlots = 1 << lgConfigK;
504 return ((numSlots * 3) >> 2) + 1;
505}
506
507template<typename A>
508uint32_t HllArray<A>::hll8ArrBytes(uint8_t lgConfigK) {
509 return 1 << lgConfigK;
510}
511
512template<typename A>
513uint32_t HllArray<A>::getMemDataStart() const {
514 return hll_constants::HLL_BYTE_ARR_START;
515}
516
517template<typename A>
518uint32_t HllArray<A>::getUpdatableSerializationBytes() const {
519 return hll_constants::HLL_BYTE_ARR_START + getHllByteArrBytes();
520}
521
522template<typename A>
523uint32_t HllArray<A>::getCompactSerializationBytes() const {
524 AuxHashMap<A>* auxHashMap = getAuxHashMap();
525 const uint32_t auxCountBytes = ((auxHashMap == nullptr) ? 0 : auxHashMap->getCompactSizeBytes());
526 return hll_constants::HLL_BYTE_ARR_START + getHllByteArrBytes() + auxCountBytes;
527}
528
529template<typename A>
530uint8_t HllArray<A>::getPreInts() const {
531 return hll_constants::HLL_PREINTS;
532}
533
534template<typename A>
535AuxHashMap<A>* HllArray<A>::getAuxHashMap() const {
536 return nullptr;
537}
538
539template<typename A>
540auto HllArray<A>::getHllArray() const -> const vector_bytes& {
541 return hllByteArr_;
542}
543
544template<typename A>
545void HllArray<A>::hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue) {
546 const uint32_t configK = 1 << this->getLgConfigK();
547 // update hip BEFORE updating kxq
548 if (!oooFlag_) hipAccum_ += configK / (kxq0_ + kxq1_);
549 // update kxq0 and kxq1; subtract first, then add
550 if (oldValue < 32) { kxq0_ -= INVERSE_POWERS_OF_2[oldValue]; }
551 else { kxq1_ -= INVERSE_POWERS_OF_2[oldValue]; }
552 if (newValue < 32) { kxq0_ += INVERSE_POWERS_OF_2[newValue]; }
553 else { kxq1_ += INVERSE_POWERS_OF_2[newValue]; }
554}
555
561//In C: again-two-registers.c hhb_get_improved_linear_counting_estimate L1274
562template<typename A>
563double HllArray<A>::getHllBitMapEstimate() const {
564 const uint32_t configK = 1 << this->lgConfigK_;
565 const uint32_t numUnhitBuckets = curMin_ == 0 ? numAtCurMin_ : 0;
566
567 //This will eventually go away.
568 if (numUnhitBuckets == 0) {
569 return configK * log(configK / 0.5);
570 }
571
572 const uint32_t numHitBuckets = configK - numUnhitBuckets;
573 return HarmonicNumbers<A>::getBitMapEstimate(configK, numHitBuckets);
574}
575
576//In C: again-two-registers.c hhb_get_raw_estimate L1167
577template<typename A>
578double HllArray<A>::getHllRawEstimate() const {
579 const uint32_t configK = 1 << this->lgConfigK_;
580 double correctionFactor;
581 if (this->lgConfigK_ == 4) { correctionFactor = 0.673; }
582 else if (this->lgConfigK_ == 5) { correctionFactor = 0.697; }
583 else if (this->lgConfigK_ == 6) { correctionFactor = 0.709; }
584 else { correctionFactor = 0.7213 / (1.0 + (1.079 / configK)); }
585 const double hyperEst = (correctionFactor * configK * configK) / (kxq0_ + kxq1_);
586 return hyperEst;
587}
588
589template<typename A>
590void HllArray<A>::setRebuildKxqCurminFlag(bool rebuild) {
591 rebuild_kxq_curmin_ = rebuild;
592}
593
594template<typename A>
595bool HllArray<A>::isRebuildKxqCurminFlag() const {
596 return rebuild_kxq_curmin_;
597}
598
599template<typename A>
600void HllArray<A>::check_rebuild_kxq_cur_min() {
601 if (!rebuild_kxq_curmin_) { return; }
602
603 uint8_t cur_min = 64;
604 uint32_t num_at_cur_min = 0;
605 double kxq0 = 1 << this->lgConfigK_;
606 double kxq1 = 0;
607
608 auto it = this->begin(true); // want all points to adjust cur_min
609 const auto end = this->end();
610 while (it != end) {
611 uint8_t v = HllUtil<A>::getValue(*it);
612 if (v > 0) {
613 if (v < 32) { kxq0 += INVERSE_POWERS_OF_2[v] - 1.0; }
614 else { kxq1 += INVERSE_POWERS_OF_2[v] - 1.0; }
615 }
616 if (v > cur_min) { ++it; continue; }
617 if (v < cur_min) {
618 cur_min = v;
619 num_at_cur_min = 1;
620 } else {
621 ++num_at_cur_min;
622 }
623 ++it;
624 }
625
626 kxq0_ = kxq0;
627 kxq1_ = kxq1;
628 curMin_ = cur_min;
629 numAtCurMin_ = num_at_cur_min;
630 rebuild_kxq_curmin_ = false;
631 // HipAccum is not affected
632
633}
634
635template<typename A>
636typename HllArray<A>::const_iterator HllArray<A>::begin(bool all) const {
637 return const_iterator(hllByteArr_.data(), 1 << this->lgConfigK_, 0, this->tgtHllType_, nullptr, 0, all);
638}
639
640template<typename A>
641typename HllArray<A>::const_iterator HllArray<A>::end() const {
642 return const_iterator(hllByteArr_.data(), 1 << this->lgConfigK_, 1 << this->lgConfigK_, this->tgtHllType_, nullptr, 0, false);
643}
644
645template<typename A>
646HllArray<A>::const_iterator::const_iterator(const uint8_t* array, uint32_t array_size, uint32_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset, bool all):
647array_(array), array_size_(array_size), index_(index), hll_type_(hll_type), exceptions_(exceptions), offset_(offset), all_(all)
648{
649 while (index_ < array_size_) {
650 value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
651 if (all_ || value_ != hll_constants::EMPTY) break;
652 ++index_;
653 }
654}
655
656template<typename A>
657typename HllArray<A>::const_iterator& HllArray<A>::const_iterator::operator++() {
658 while (++index_ < array_size_) {
659 value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
660 if (all_ || value_ != hll_constants::EMPTY) break;
661 }
662 return *this;
663}
664
665template<typename A>
666bool HllArray<A>::const_iterator::operator!=(const const_iterator& other) const {
667 return index_ != other.index_;
668}
669
670template<typename A>
671auto HllArray<A>::const_iterator::operator*() const -> reference {
672 return HllUtil<A>::pair(index_, value_);
673}
674
675template<typename A>
676uint8_t HllArray<A>::const_iterator::get_value(const uint8_t* array, uint32_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset) {
677 // TODO: we should be able to improve efficiency here by reading multiple bytes at a time
678 // for HLL4 and HLL6
679 if (hll_type == target_hll_type::HLL_4) {
680 uint8_t value = array[index >> 1];
681 if ((index & 1) > 0) { // odd
682 value >>= 4;
683 } else {
684 value &= hll_constants::loNibbleMask;
685 }
686 if (value == hll_constants::AUX_TOKEN) { // exception
687 return exceptions->mustFindValueFor(index);
688 }
689 return value + offset;
690 } else if (hll_type == target_hll_type::HLL_6) {
691 const size_t start_bit = index * 6;
692 const uint8_t shift = start_bit & 0x7;
693 const size_t byte_idx = start_bit >> 3;
694 const uint16_t two_byte_val = (array[byte_idx + 1] << 8) | array[byte_idx];
695 return (two_byte_val >> shift) & hll_constants::VAL_MASK_6;
696 }
697 // HLL_8
698 return array[index];
699}
700
701template<typename A>
702A HllArray<A>::getAllocator() const {
703 return hllByteArr_.get_allocator();
704}
705
706}
707
708#endif // _HLLARRAY_INTERNAL_HPP_
DataSketches namespace.
Definition binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition hll.hpp:72
@ HLL_6
6 bits per entry (fixed size)
Definition hll.hpp:74
@ HLL_8
8 bits per entry (fastest, fixed size)
Definition hll.hpp:75
@ HLL_4
4 bits per entry (most compact, size may vary)
Definition hll.hpp:73