datasketches-cpp
HllArray-internal.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _HLLARRAY_INTERNAL_HPP_
21 #define _HLLARRAY_INTERNAL_HPP_
22 
23 #include "HllArray.hpp"
24 #include "HllUtil.hpp"
25 #include "HarmonicNumbers.hpp"
26 #include "CubicInterpolation.hpp"
27 #include "CompositeInterpolationXTable.hpp"
28 #include "CouponList.hpp"
29 #include "inv_pow2_table.hpp"
30 #include <cstring>
31 #include <cmath>
32 #include <stdexcept>
33 #include <string>
34 
35 namespace datasketches {
36 
37 template<typename A>
38 HllArray<A>::HllArray(uint8_t lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator):
39 HllSketchImpl<A>(lgConfigK, tgtHllType, hll_mode::HLL, startFullSize),
40 hipAccum_(0.0),
41 kxq0_(1 << lgConfigK),
42 kxq1_(0.0),
43 hllByteArr_(allocator),
44 curMin_(0),
45 numAtCurMin_(1 << lgConfigK),
46 oooFlag_(false),
47 rebuild_kxq_curmin_(false)
48 {}
49 
50 template<typename A>
51 HllArray<A>::HllArray(const HllArray& other, target_hll_type tgtHllType) :
52  HllSketchImpl<A>(other.getLgConfigK(), tgtHllType, hll_mode::HLL, other.isStartFullSize()),
53  // remaining fields are initialized to empty sketch defaults
54  // and left to subclass constructor to populate
55  hipAccum_(0.0),
56  kxq0_(1 << other.getLgConfigK()),
57  kxq1_(0.0),
58  hllByteArr_(other.getAllocator()),
59  curMin_(0),
60  numAtCurMin_(1 << other.getLgConfigK()),
61  oooFlag_(false),
62  rebuild_kxq_curmin_(false)
63 {}
64 
65 template<typename A>
66 HllArray<A>* HllArray<A>::copyAs(target_hll_type tgtHllType) const {
67  // we may need to recompute KxQ and curMin data for a union gadget,
68  // so only use a direct copy if we have a valid sketch
69  if (tgtHllType == this->getTgtHllType() && !this->isRebuildKxqCurminFlag()) {
70  return static_cast<HllArray*>(copy());
71  }
72 
73  // the factory methods replay the coupons and will always rebuild
74  // the sketch in a consistent way
75  switch (tgtHllType) {
76  case target_hll_type::HLL_4:
77  return HllSketchImplFactory<A>::convertToHll4(*this);
78  case target_hll_type::HLL_6:
79  return HllSketchImplFactory<A>::convertToHll6(*this);
80  case target_hll_type::HLL_8:
81  return HllSketchImplFactory<A>::convertToHll8(*this);
82  default:
83  throw std::invalid_argument("Invalid target HLL type");
84  }
85 }
86 
87 template<typename A>
88 HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len, const A& allocator) {
89  if (len < hll_constants::HLL_BYTE_ARR_START) {
90  throw std::out_of_range("Input data length insufficient to hold HLL array");
91  }
92 
93  const uint8_t* data = static_cast<const uint8_t*>(bytes);
94  if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HLL_PREINTS) {
95  throw std::invalid_argument("Incorrect number of preInts in input stream");
96  }
97  if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
98  throw std::invalid_argument("Wrong ser ver in input stream");
99  }
100  if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
101  throw std::invalid_argument("Input array is not an HLL sketch");
102  }
103 
104  const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
105  if (mode != HLL) {
106  throw std::invalid_argument("Calling HLL array constructor with non-HLL mode data");
107  }
108 
109  const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
110  const bool oooFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
111  const bool comapctFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
112  const bool startFullSizeFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::FULL_SIZE_FLAG_MASK) ? true : false);
113 
114  const uint8_t lgK = data[hll_constants::LG_K_BYTE];
115  const uint8_t curMin = data[hll_constants::HLL_CUR_MIN_BYTE];
116 
117  const uint32_t arrayBytes = hllArrBytes(tgtHllType, lgK);
118  if (len < static_cast<size_t>(hll_constants::HLL_BYTE_ARR_START + arrayBytes)) {
119  throw std::out_of_range("Input array too small to hold sketch image");
120  }
121 
122  double hip, kxq0, kxq1;
123  std::memcpy(&hip, data + hll_constants::HIP_ACCUM_DOUBLE, sizeof(double));
124  std::memcpy(&kxq0, data + hll_constants::KXQ0_DOUBLE, sizeof(double));
125  std::memcpy(&kxq1, data + hll_constants::KXQ1_DOUBLE, sizeof(double));
126 
127  uint32_t numAtCurMin, auxCount;
128  std::memcpy(&numAtCurMin, data + hll_constants::CUR_MIN_COUNT_INT, sizeof(int));
129  std::memcpy(&auxCount, data + hll_constants::AUX_COUNT_INT, sizeof(int));
130 
131  AuxHashMap<A>* auxHashMap = nullptr;
132  typedef std::unique_ptr<AuxHashMap<A>, std::function<void(AuxHashMap<A>*)>> aux_hash_map_ptr;
133  aux_hash_map_ptr aux_ptr;
134  if (auxCount > 0) { // necessarily TgtHllType == HLL_4
135  uint8_t auxLgIntArrSize = data[4];
136  const size_t offset = hll_constants::HLL_BYTE_ARR_START + arrayBytes;
137  const uint8_t* auxDataStart = data + offset;
138  auxHashMap = AuxHashMap<A>::deserialize(auxDataStart, len - offset, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
139  aux_ptr = aux_hash_map_ptr(auxHashMap, auxHashMap->make_deleter());
140  }
141 
142  HllArray<A>* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
143  sketch->putCurMin(curMin);
144  sketch->putOutOfOrderFlag(oooFlag);
145  if (!oooFlag) sketch->putHipAccum(hip);
146  sketch->putKxQ0(kxq0);
147  sketch->putKxQ1(kxq1);
148  sketch->putNumAtCurMin(numAtCurMin);
149 
150  std::memcpy(sketch->hllByteArr_.data(), data + hll_constants::HLL_BYTE_ARR_START, arrayBytes);
151 
152  if (auxHashMap != nullptr)
153  ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
154 
155  aux_ptr.release();
156  return sketch;
157 }
158 
159 template<typename A>
160 HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
161  uint8_t listHeader[8];
162  read(is, listHeader, 8 * sizeof(uint8_t));
163 
164  if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HLL_PREINTS) {
165  throw std::invalid_argument("Incorrect number of preInts in input stream");
166  }
167  if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
168  throw std::invalid_argument("Wrong ser ver in input stream");
169  }
170  if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
171  throw std::invalid_argument("Input stream is not an HLL sketch");
172  }
173 
174  hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
175  if (mode != HLL) {
176  throw std::invalid_argument("Calling HLL construtor with non-HLL mode data");
177  }
178 
179  const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
180  const bool oooFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
181  const bool comapctFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
182  const bool startFullSizeFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::FULL_SIZE_FLAG_MASK) ? true : false);
183 
184  const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
185  const uint8_t curMin = listHeader[hll_constants::HLL_CUR_MIN_BYTE];
186 
187  HllArray* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
188  typedef std::unique_ptr<HllArray<A>, std::function<void(HllSketchImpl<A>*)>> hll_array_ptr;
189  hll_array_ptr sketch_ptr(sketch, sketch->get_deleter());
190  sketch->putCurMin(curMin);
191  sketch->putOutOfOrderFlag(oooFlag);
192 
193  const auto hip = read<double>(is);
194  const auto kxq0 = read<double>(is);
195  const auto kxq1 = read<double>(is);
196  if (!oooFlag) sketch->putHipAccum(hip);
197  sketch->putKxQ0(kxq0);
198  sketch->putKxQ1(kxq1);
199 
200  const auto numAtCurMin = read<uint32_t>(is);
201  const auto auxCount = read<uint32_t>(is);
202  sketch->putNumAtCurMin(numAtCurMin);
203 
204  read(is, sketch->hllByteArr_.data(), sketch->getHllByteArrBytes());
205 
206  if (auxCount > 0) { // necessarily TgtHllType == HLL_4
207  uint8_t auxLgIntArrSize = listHeader[4];
208  AuxHashMap<A>* auxHashMap = AuxHashMap<A>::deserialize(is, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
209  ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
210  }
211 
212  if (!is.good())
213  throw std::runtime_error("error reading from std::istream");
214 
215  return sketch_ptr.release();
216 }
217 
218 template<typename A>
219 auto HllArray<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
220  const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
221  vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
222  uint8_t* bytes = byteArr.data() + header_size_bytes;
223  AuxHashMap<A>* auxHashMap = getAuxHashMap();
224 
225  bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
226  bytes[hll_constants::SER_VER_BYTE] = static_cast<uint8_t>(hll_constants::SER_VER);
227  bytes[hll_constants::FAMILY_BYTE] = static_cast<uint8_t>(hll_constants::FAMILY_ID);
228  bytes[hll_constants::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK_);
229  bytes[hll_constants::LG_ARR_BYTE] = static_cast<uint8_t>(auxHashMap == nullptr ? 0 : auxHashMap->getLgAuxArrInts());
230  bytes[hll_constants::FLAGS_BYTE] = this->makeFlagsByte(compact);
231  bytes[hll_constants::HLL_CUR_MIN_BYTE] = static_cast<uint8_t>(curMin_);
232  bytes[hll_constants::MODE_BYTE] = this->makeModeByte();
233 
234  std::memcpy(bytes + hll_constants::HIP_ACCUM_DOUBLE, &hipAccum_, sizeof(double));
235  std::memcpy(bytes + hll_constants::KXQ0_DOUBLE, &kxq0_, sizeof(double));
236  std::memcpy(bytes + hll_constants::KXQ1_DOUBLE, &kxq1_, sizeof(double));
237  std::memcpy(bytes + hll_constants::CUR_MIN_COUNT_INT, &numAtCurMin_, sizeof(uint32_t));
238  const uint32_t auxCount = (auxHashMap == nullptr ? 0 : auxHashMap->getAuxCount());
239  std::memcpy(bytes + hll_constants::AUX_COUNT_INT, &auxCount, sizeof(uint32_t));
240 
241  const uint32_t hllByteArrBytes = getHllByteArrBytes();
242  std::memcpy(bytes + getMemDataStart(), hllByteArr_.data(), hllByteArrBytes);
243 
244  // aux map if HLL_4
245  if (this->tgtHllType_ == HLL_4) {
246  bytes += getMemDataStart() + hllByteArrBytes; // start of auxHashMap
247  if (auxHashMap != nullptr) {
248  if (compact) {
249  for (const uint32_t coupon: *auxHashMap) {
250  std::memcpy(bytes, &coupon, sizeof(coupon));
251  bytes += sizeof(coupon);
252  }
253  } else {
254  std::memcpy(bytes, auxHashMap->getAuxIntArr(), auxHashMap->getUpdatableSizeBytes());
255  }
256  } else if (!compact) {
257  // if updatable, we write even if currently unused so the binary can be wrapped
258  uint32_t auxBytes = 4 << hll_constants::LG_AUX_ARR_INTS[this->lgConfigK_];
259  std::fill_n(bytes, auxBytes, static_cast<uint8_t>(0));
260  }
261  }
262 
263  return byteArr;
264 }
265 
266 template<typename A>
267 void HllArray<A>::serialize(std::ostream& os, bool compact) const {
268  // header
269  const uint8_t preInts = getPreInts();
270  write(os, preInts);
271  const uint8_t serialVersion = hll_constants::SER_VER;
272  write(os, serialVersion);
273  const uint8_t familyId = hll_constants::FAMILY_ID;
274  write(os, familyId);
275  const uint8_t lgKByte = this->lgConfigK_;
276  write(os, lgKByte);
277 
278  AuxHashMap<A>* auxHashMap = getAuxHashMap();
279  uint8_t lgArrByte = 0;
280  if (auxHashMap != nullptr) {
281  lgArrByte = auxHashMap->getLgAuxArrInts();
282  }
283  write(os, lgArrByte);
284 
285  const uint8_t flagsByte = this->makeFlagsByte(compact);
286  write(os, flagsByte);
287  write(os, curMin_);
288  const uint8_t modeByte = this->makeModeByte();
289  write(os, modeByte);
290 
291  // estimator data
292  write(os, hipAccum_);
293  write(os, kxq0_);
294  write(os, kxq1_);
295 
296  // array data
297  write(os, numAtCurMin_);
298 
299  const uint32_t auxCount = (auxHashMap == nullptr ? 0 : auxHashMap->getAuxCount());
300  write(os, auxCount);
301  write(os, hllByteArr_.data(), getHllByteArrBytes());
302 
303  // aux map if HLL_4
304  if (this->tgtHllType_ == HLL_4) {
305  if (auxHashMap != nullptr) {
306  if (compact) {
307  for (const uint32_t coupon: *auxHashMap) {
308  write(os, coupon);
309  }
310  } else {
311  write(os, auxHashMap->getAuxIntArr(), auxHashMap->getUpdatableSizeBytes());
312  }
313  } else if (!compact) {
314  // if updatable, we write even if currently unused so the binary can be wrapped
315  uint32_t auxBytes = 4 << hll_constants::LG_AUX_ARR_INTS[this->lgConfigK_];
316  std::fill_n(std::ostreambuf_iterator<char>(os), auxBytes, static_cast<char>(0));
317  }
318  }
319 }
320 
321 template<typename A>
322 double HllArray<A>::getEstimate() const {
323  if (oooFlag_) {
324  return getCompositeEstimate();
325  }
326  return hipAccum_;
327 }
328 
329 // HLL UPPER AND LOWER BOUNDS
330 
331 /*
332  * The upper and lower bounds are not symmetric and thus are treated slightly differently.
333  * For the lower bound, when the unique count is <= k, LB >= numNonZeros, where
334  * numNonZeros = k - numAtCurMin AND curMin == 0.
335  *
336  * For HLL6 and HLL8, curMin is always 0 and numAtCurMin is initialized to k and is decremented
337  * down for each valid update until it reaches 0, where it stays. Thus, for these two
338  * isomorphs, when numAtCurMin = 0, means the true curMin is > 0 and the unique count must be
339  * greater than k.
340  *
341  * HLL4 always maintains both curMin and numAtCurMin dynamically. Nonetheless, the rules for
342  * the very small values <= k where curMin = 0 still apply.
343  */
344 template<typename A>
345 double HllArray<A>::getLowerBound(uint8_t numStdDev) const {
346  HllUtil<A>::checkNumStdDev(numStdDev);
347  const uint32_t configK = 1 << this->lgConfigK_;
348  const double numNonZeros = ((curMin_ == 0) ? (configK - numAtCurMin_) : configK);
349  const double relErr = HllUtil<A>::getRelErr(false, this->oooFlag_, this->lgConfigK_, numStdDev);
350  return fmax(getEstimate() / (1.0 + relErr), numNonZeros);
351 }
352 
353 template<typename A>
354 double HllArray<A>::getUpperBound(uint8_t numStdDev) const {
355  HllUtil<A>::checkNumStdDev(numStdDev);
356  const double relErr = HllUtil<A>::getRelErr(true, this->oooFlag_, this->lgConfigK_, numStdDev);
357  return getEstimate() / (1.0 + relErr);
358 }
359 
365 // Original C: again-two-registers.c hhb_get_composite_estimate L1489
366 template<typename A>
367 double HllArray<A>::getCompositeEstimate() const {
368  const double rawEst = getHllRawEstimate();
369 
370  const double* xArr = CompositeInterpolationXTable<A>::get_x_arr(this->lgConfigK_);
371  const uint32_t xArrLen = CompositeInterpolationXTable<A>::get_x_arr_length();
372  const double yStride = CompositeInterpolationXTable<A>::get_y_stride(this->lgConfigK_);
373 
374  if (rawEst < xArr[0]) {
375  return 0;
376  }
377 
378  const uint32_t xArrLenM1 = xArrLen - 1;
379 
380  if (rawEst > xArr[xArrLenM1]) {
381  const double finalY = yStride * xArrLenM1;
382  const double factor = finalY / xArr[xArrLenM1];
383  return rawEst * factor;
384  }
385 
386  double adjEst = CubicInterpolation<A>::usingXArrAndYStride(xArr, xArrLen, yStride, rawEst);
387 
388  // We need to completely avoid the linear_counting estimator if it might have a crazy value.
389  // Empirical evidence suggests that the threshold 3*k will keep us safe if 2^4 <= k <= 2^21.
390 
391  if (adjEst > (3 << this->lgConfigK_)) { return adjEst; }
392 
393  const double linEst = getHllBitMapEstimate();
394 
395  // Bias is created when the value of an estimator is compared with a threshold to decide whether
396  // to use that estimator or a different one.
397  // We conjecture that less bias is created when the average of the two estimators
398  // is compared with the threshold. Empirical measurements support this conjecture.
399 
400  const double avgEst = (adjEst + linEst) / 2.0;
401 
402  // The following constants comes from empirical measurements of the crossover point
403  // between the average error of the linear estimator and the adjusted hll estimator
404  double crossOver = 0.64;
405  if (this->lgConfigK_ == 4) { crossOver = 0.718; }
406  else if (this->lgConfigK_ == 5) { crossOver = 0.672; }
407 
408  return (avgEst > (crossOver * (1 << this->lgConfigK_))) ? adjEst : linEst;
409 }
410 
411 template<typename A>
412 double HllArray<A>::getKxQ0() const {
413  return kxq0_;
414 }
415 
416 template<typename A>
417 double HllArray<A>::getKxQ1() const {
418  return kxq1_;
419 }
420 
421 template<typename A>
422 double HllArray<A>::getHipAccum() const {
423  return hipAccum_;
424 }
425 
426 template<typename A>
427 uint8_t HllArray<A>::getCurMin() const {
428  return curMin_;
429 }
430 
431 template<typename A>
432 uint32_t HllArray<A>::getNumAtCurMin() const {
433  return numAtCurMin_;
434 }
435 
436 template<typename A>
437 void HllArray<A>::putKxQ0(double kxq0) {
438  kxq0_ = kxq0;
439 }
440 
441 template<typename A>
442 void HllArray<A>::putKxQ1(double kxq1) {
443  kxq1_ = kxq1;
444 }
445 
446 template<typename A>
447 void HllArray<A>::putHipAccum(double hipAccum) {
448  hipAccum_ = hipAccum;
449 }
450 
451 template<typename A>
452 void HllArray<A>::putCurMin(uint8_t curMin) {
453  curMin_ = curMin;
454 }
455 
456 template<typename A>
457 void HllArray<A>::putNumAtCurMin(uint32_t numAtCurMin) {
458  numAtCurMin_ = numAtCurMin;
459 }
460 
461 template<typename A>
462 bool HllArray<A>::isCompact() const {
463  return false;
464 }
465 
466 template<typename A>
467 bool HllArray<A>::isEmpty() const {
468  const uint32_t configK = 1 << this->lgConfigK_;
469  return (curMin_ == 0) && (numAtCurMin_ == configK);
470 }
471 
472 template<typename A>
473 void HllArray<A>::putOutOfOrderFlag(bool flag) {
474  oooFlag_ = flag;
475 }
476 
477 template<typename A>
478 bool HllArray<A>::isOutOfOrderFlag() const {
479  return oooFlag_;
480 }
481 
482 template<typename A>
483 uint32_t HllArray<A>::hllArrBytes(target_hll_type tgtHllType, uint8_t lgConfigK) {
484  switch (tgtHllType) {
485  case HLL_4:
486  return hll4ArrBytes(lgConfigK);
487  case HLL_6:
488  return hll6ArrBytes(lgConfigK);
489  case HLL_8:
490  return hll8ArrBytes(lgConfigK);
491  default:
492  throw std::invalid_argument("Invalid target HLL type");
493  }
494 }
495 
496 template<typename A>
497 uint32_t HllArray<A>::hll4ArrBytes(uint8_t lgConfigK) {
498  return 1 << (lgConfigK - 1);
499 }
500 
501 template<typename A>
502 uint32_t HllArray<A>::hll6ArrBytes(uint8_t lgConfigK) {
503  const uint32_t numSlots = 1 << lgConfigK;
504  return ((numSlots * 3) >> 2) + 1;
505 }
506 
507 template<typename A>
508 uint32_t HllArray<A>::hll8ArrBytes(uint8_t lgConfigK) {
509  return 1 << lgConfigK;
510 }
511 
512 template<typename A>
513 uint32_t HllArray<A>::getMemDataStart() const {
514  return hll_constants::HLL_BYTE_ARR_START;
515 }
516 
517 template<typename A>
518 uint32_t HllArray<A>::getUpdatableSerializationBytes() const {
519  return hll_constants::HLL_BYTE_ARR_START + getHllByteArrBytes();
520 }
521 
522 template<typename A>
523 uint32_t HllArray<A>::getCompactSerializationBytes() const {
524  AuxHashMap<A>* auxHashMap = getAuxHashMap();
525  const uint32_t auxCountBytes = ((auxHashMap == nullptr) ? 0 : auxHashMap->getCompactSizeBytes());
526  return hll_constants::HLL_BYTE_ARR_START + getHllByteArrBytes() + auxCountBytes;
527 }
528 
529 template<typename A>
530 uint8_t HllArray<A>::getPreInts() const {
531  return hll_constants::HLL_PREINTS;
532 }
533 
534 template<typename A>
535 AuxHashMap<A>* HllArray<A>::getAuxHashMap() const {
536  return nullptr;
537 }
538 
539 template<typename A>
540 auto HllArray<A>::getHllArray() const -> const vector_bytes& {
541  return hllByteArr_;
542 }
543 
544 template<typename A>
545 void HllArray<A>::hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue) {
546  const uint32_t configK = 1 << this->getLgConfigK();
547  // update hip BEFORE updating kxq
548  if (!oooFlag_) hipAccum_ += configK / (kxq0_ + kxq1_);
549  // update kxq0 and kxq1; subtract first, then add
550  if (oldValue < 32) { kxq0_ -= INVERSE_POWERS_OF_2[oldValue]; }
551  else { kxq1_ -= INVERSE_POWERS_OF_2[oldValue]; }
552  if (newValue < 32) { kxq0_ += INVERSE_POWERS_OF_2[newValue]; }
553  else { kxq1_ += INVERSE_POWERS_OF_2[newValue]; }
554 }
555 
561 //In C: again-two-registers.c hhb_get_improved_linear_counting_estimate L1274
562 template<typename A>
563 double HllArray<A>::getHllBitMapEstimate() const {
564  const uint32_t configK = 1 << this->lgConfigK_;
565  const uint32_t numUnhitBuckets = curMin_ == 0 ? numAtCurMin_ : 0;
566 
567  //This will eventually go away.
568  if (numUnhitBuckets == 0) {
569  return configK * log(configK / 0.5);
570  }
571 
572  const uint32_t numHitBuckets = configK - numUnhitBuckets;
573  return HarmonicNumbers<A>::getBitMapEstimate(configK, numHitBuckets);
574 }
575 
576 //In C: again-two-registers.c hhb_get_raw_estimate L1167
577 template<typename A>
578 double HllArray<A>::getHllRawEstimate() const {
579  const uint32_t configK = 1 << this->lgConfigK_;
580  double correctionFactor;
581  if (this->lgConfigK_ == 4) { correctionFactor = 0.673; }
582  else if (this->lgConfigK_ == 5) { correctionFactor = 0.697; }
583  else if (this->lgConfigK_ == 6) { correctionFactor = 0.709; }
584  else { correctionFactor = 0.7213 / (1.0 + (1.079 / configK)); }
585  const double hyperEst = (correctionFactor * configK * configK) / (kxq0_ + kxq1_);
586  return hyperEst;
587 }
588 
589 template<typename A>
590 void HllArray<A>::setRebuildKxqCurminFlag(bool rebuild) {
591  rebuild_kxq_curmin_ = rebuild;
592 }
593 
594 template<typename A>
595 bool HllArray<A>::isRebuildKxqCurminFlag() const {
596  return rebuild_kxq_curmin_;
597 }
598 
599 template<typename A>
600 void HllArray<A>::check_rebuild_kxq_cur_min() {
601  if (!rebuild_kxq_curmin_) { return; }
602 
603  uint8_t cur_min = 64;
604  uint32_t num_at_cur_min = 0;
605  double kxq0 = 1 << this->lgConfigK_;
606  double kxq1 = 0;
607 
608  auto it = this->begin(true); // want all points to adjust cur_min
609  const auto end = this->end();
610  while (it != end) {
611  uint8_t v = HllUtil<A>::getValue(*it);
612  if (v > 0) {
613  if (v < 32) { kxq0 += INVERSE_POWERS_OF_2[v] - 1.0; }
614  else { kxq1 += INVERSE_POWERS_OF_2[v] - 1.0; }
615  }
616  if (v > cur_min) { ++it; continue; }
617  if (v < cur_min) {
618  cur_min = v;
619  num_at_cur_min = 1;
620  } else {
621  ++num_at_cur_min;
622  }
623  ++it;
624  }
625 
626  kxq0_ = kxq0;
627  kxq1_ = kxq1;
628  curMin_ = cur_min;
629  numAtCurMin_ = num_at_cur_min;
630  rebuild_kxq_curmin_ = false;
631  // HipAccum is not affected
632 
633 }
634 
635 template<typename A>
636 typename HllArray<A>::const_iterator HllArray<A>::begin(bool all) const {
637  return const_iterator(hllByteArr_.data(), 1 << this->lgConfigK_, 0, this->tgtHllType_, nullptr, 0, all);
638 }
639 
640 template<typename A>
641 typename HllArray<A>::const_iterator HllArray<A>::end() const {
642  return const_iterator(hllByteArr_.data(), 1 << this->lgConfigK_, 1 << this->lgConfigK_, this->tgtHllType_, nullptr, 0, false);
643 }
644 
645 template<typename A>
646 HllArray<A>::const_iterator::const_iterator(const uint8_t* array, uint32_t array_size, uint32_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset, bool all):
647 array_(array), array_size_(array_size), index_(index), hll_type_(hll_type), exceptions_(exceptions), offset_(offset), all_(all)
648 {
649  while (index_ < array_size_) {
650  value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
651  if (all_ || value_ != hll_constants::EMPTY) break;
652  ++index_;
653  }
654 }
655 
656 template<typename A>
657 typename HllArray<A>::const_iterator& HllArray<A>::const_iterator::operator++() {
658  while (++index_ < array_size_) {
659  value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
660  if (all_ || value_ != hll_constants::EMPTY) break;
661  }
662  return *this;
663 }
664 
665 template<typename A>
666 bool HllArray<A>::const_iterator::operator!=(const const_iterator& other) const {
667  return index_ != other.index_;
668 }
669 
670 template<typename A>
671 auto HllArray<A>::const_iterator::operator*() const -> reference {
672  return HllUtil<A>::pair(index_, value_);
673 }
674 
675 template<typename A>
676 uint8_t HllArray<A>::const_iterator::get_value(const uint8_t* array, uint32_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset) {
677  // TODO: we should be able to improve efficiency here by reading multiple bytes at a time
678  // for HLL4 and HLL6
679  if (hll_type == target_hll_type::HLL_4) {
680  uint8_t value = array[index >> 1];
681  if ((index & 1) > 0) { // odd
682  value >>= 4;
683  } else {
684  value &= hll_constants::loNibbleMask;
685  }
686  if (value == hll_constants::AUX_TOKEN) { // exception
687  return exceptions->mustFindValueFor(index);
688  }
689  return value + offset;
690  } else if (hll_type == target_hll_type::HLL_6) {
691  const size_t start_bit = index * 6;
692  const uint8_t shift = start_bit & 0x7;
693  const size_t byte_idx = start_bit >> 3;
694  const uint16_t two_byte_val = (array[byte_idx + 1] << 8) | array[byte_idx];
695  return (two_byte_val >> shift) & hll_constants::VAL_MASK_6;
696  }
697  // HLL_8
698  return array[index];
699 }
700 
701 template<typename A>
702 A HllArray<A>::getAllocator() const {
703  return hllByteArr_.get_allocator();
704 }
705 
706 }
707 
708 #endif // _HLLARRAY_INTERNAL_HPP_
DataSketches namespace.
Definition: binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition: hll.hpp:72
@ HLL_6
6 bits per entry (fixed size)
Definition: hll.hpp:74
@ HLL_8
8 bits per entry (fastest, fixed size)
Definition: hll.hpp:75
@ HLL_4
4 bits per entry (most compact, size may vary)
Definition: hll.hpp:73