datasketches-cpp
CouponList-internal.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _COUPONLIST_INTERNAL_HPP_
21 #define _COUPONLIST_INTERNAL_HPP_
22 
23 #include "CouponList.hpp"
24 #include "CubicInterpolation.hpp"
25 #include "HllUtil.hpp"
26 #include "count_zeros.hpp"
27 
28 #include <algorithm>
29 #include <cmath>
30 #include <stdexcept>
31 
32 namespace datasketches {
33 
34 template<typename A>
35 CouponList<A>::CouponList(uint8_t lgConfigK, target_hll_type tgtHllType, hll_mode mode, const A& allocator):
36 HllSketchImpl<A>(lgConfigK, tgtHllType, mode, false),
37 couponCount_(0),
38 oooFlag_(false),
39 coupons_(1ULL << (mode == hll_mode::LIST ? hll_constants::LG_INIT_LIST_SIZE : hll_constants::LG_INIT_SET_SIZE), 0, allocator)
40 {}
41 
42 template<typename A>
43 CouponList<A>::CouponList(const CouponList& that, const target_hll_type tgtHllType):
44 HllSketchImpl<A>(that.lgConfigK_, tgtHllType, that.mode_, false),
45 couponCount_(that.couponCount_),
46 oooFlag_(that.oooFlag_),
47 coupons_(that.coupons_)
48 {}
49 
50 template<typename A>
51 std::function<void(HllSketchImpl<A>*)> CouponList<A>::get_deleter() const {
52  return [](HllSketchImpl<A>* ptr) {
53  CouponList<A>* cl = static_cast<CouponList<A>*>(ptr);
54  ClAlloc cla(cl->getAllocator());
55  cl->~CouponList();
56  cla.deallocate(cl, 1);
57  };
58 }
59 
60 template<typename A>
61 CouponList<A>* CouponList<A>::copy() const {
62  ClAlloc cla(coupons_.get_allocator());
63  return new (cla.allocate(1)) CouponList<A>(*this);
64 }
65 
66 template<typename A>
67 CouponList<A>* CouponList<A>::copyAs(target_hll_type tgtHllType) const {
68  ClAlloc cla(coupons_.get_allocator());
69  return new (cla.allocate(1)) CouponList<A>(*this, tgtHllType);
70 }
71 
72 template<typename A>
73 CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len, const A& allocator) {
74  if (len < hll_constants::LIST_INT_ARR_START) {
75  throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
76  }
77 
78  const uint8_t* data = static_cast<const uint8_t*>(bytes);
79  if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
80  throw std::invalid_argument("Incorrect number of preInts in input stream");
81  }
82  if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
83  throw std::invalid_argument("Wrong ser ver in input stream");
84  }
85  if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
86  throw std::invalid_argument("Input stream is not an HLL sketch");
87  }
88 
89  hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
90  if (mode != LIST) {
91  throw std::invalid_argument("Calling list constructor with non-list mode data");
92  }
93 
94  target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
95 
96  const uint8_t lgK = data[hll_constants::LG_K_BYTE];
97  const bool compact = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
98  const bool oooFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
99  const bool emptyFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
100 
101  const uint32_t couponCount = data[hll_constants::LIST_COUNT_BYTE];
102  const uint32_t couponsInArray = (compact ? couponCount : (1 << HllUtil<A>::computeLgArrInts(LIST, couponCount, lgK)));
103  const size_t expectedLength = hll_constants::LIST_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
104  if (len < expectedLength) {
105  throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
106  + ", found: " + std::to_string(len));
107  }
108 
109  ClAlloc cla(allocator);
110  CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
111  sketch->couponCount_ = couponCount;
112  sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
113 
114  if (!emptyFlag) {
115  // only need to read valid coupons, unlike in stream case
116  std::memcpy(sketch->coupons_.data(), data + hll_constants::LIST_INT_ARR_START, couponCount * sizeof(uint32_t));
117  }
118 
119  return sketch;
120 }
121 
122 template<typename A>
123 CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
124  uint8_t listHeader[8];
125  read(is, listHeader, 8 * sizeof(uint8_t));
126 
127  if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
128  throw std::invalid_argument("Incorrect number of preInts in input stream");
129  }
130  if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
131  throw std::invalid_argument("Wrong ser ver in input stream");
132  }
133  if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
134  throw std::invalid_argument("Input stream is not an HLL sketch");
135  }
136 
137  hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
138  if (mode != LIST) {
139  throw std::invalid_argument("Calling list constructor with non-list mode data");
140  }
141 
142  const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
143 
144  const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
145  const bool compact = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
146  const bool oooFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
147  const bool emptyFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
148 
149  ClAlloc cla(allocator);
150  CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
151  using coupon_list_ptr = std::unique_ptr<CouponList<A>, std::function<void(HllSketchImpl<A>*)>>;
152  coupon_list_ptr ptr(sketch, sketch->get_deleter());
153  const uint32_t couponCount = listHeader[hll_constants::LIST_COUNT_BYTE];
154  sketch->couponCount_ = couponCount;
155  sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
156 
157  if (!emptyFlag) {
158  // For stream processing, need to read entire number written to stream so read
159  // pointer ends up set correctly.
160  // If not compact, still need to read empty items even though in order.
161  const uint32_t numToRead = (compact ? couponCount : static_cast<uint32_t>(sketch->coupons_.size()));
162  read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t));
163  }
164 
165  if (!is.good())
166  throw std::runtime_error("error reading from std::istream");
167 
168  return ptr.release();
169 }
170 
171 template<typename A>
172 auto CouponList<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
173  const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
174  vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
175  uint8_t* bytes = byteArr.data() + header_size_bytes;
176 
177  bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
178  bytes[hll_constants::SER_VER_BYTE] = static_cast<uint8_t>(hll_constants::SER_VER);
179  bytes[hll_constants::FAMILY_BYTE] = static_cast<uint8_t>(hll_constants::FAMILY_ID);
180  bytes[hll_constants::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK_);
181  bytes[hll_constants::LG_ARR_BYTE] = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
182  bytes[hll_constants::FLAGS_BYTE] = this->makeFlagsByte(compact);
183  bytes[hll_constants::LIST_COUNT_BYTE] = static_cast<uint8_t>(this->mode_ == LIST ? couponCount_ : 0);
184  bytes[hll_constants::MODE_BYTE] = this->makeModeByte();
185 
186  if (this->mode_ == SET) {
187  std::memcpy(bytes + hll_constants::HASH_SET_COUNT_INT, &couponCount_, sizeof(couponCount_));
188  }
189 
190  // coupons
191  // isCompact() is always false for now
192  const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
193  switch (sw) {
194  case 0: { // src updatable, dst updatable
195  std::memcpy(bytes + getMemDataStart(), coupons_.data(), coupons_.size() * sizeof(uint32_t));
196  break;
197  }
198  case 1: { // src updatable, dst compact
199  bytes += getMemDataStart(); // reusing pointer for incremental writes
200  for (const uint32_t coupon: *this) {
201  std::memcpy(bytes, &coupon, sizeof(coupon));
202  bytes += sizeof(coupon);
203  }
204  break;
205  }
206 
207  default:
208  throw std::runtime_error("Impossible condition when serializing");
209  }
210 
211  return byteArr;
212 }
213 
214 template<typename A>
215 void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
216  // header
217  const uint8_t preInts = getPreInts();
218  write(os, preInts);
219  const uint8_t serialVersion(hll_constants::SER_VER);
220  write(os, serialVersion);
221  const uint8_t familyId(hll_constants::FAMILY_ID);
222  write(os, familyId);
223  const uint8_t lgKByte = this->lgConfigK_;
224  write(os, lgKByte);
225  const uint8_t lgArrIntsByte = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
226  write(os, lgArrIntsByte);
227  const uint8_t flagsByte = this->makeFlagsByte(compact);
228  write(os, flagsByte);
229 
230  if (this->mode_ == LIST) {
231  const uint8_t listCount = static_cast<uint8_t>(couponCount_);
232  write(os, listCount);
233  } else { // mode == SET
234  const uint8_t unused = 0;
235  write(os, unused);
236  }
237 
238  const uint8_t modeByte = this->makeModeByte();
239  write(os, modeByte);
240 
241  if (this->mode_ == SET) {
242  // writing as int, already stored as int
243  write(os, couponCount_);
244  }
245 
246  // coupons
247  // isCompact() is always false for now
248  const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
249  switch (sw) {
250  case 0: { // src updatable, dst updatable
251  write(os, coupons_.data(), coupons_.size() * sizeof(uint32_t));
252  break;
253  }
254  case 1: { // src updatable, dst compact
255  for (const uint32_t coupon: *this) {
256  write(os, coupon);
257  }
258  break;
259  }
260 
261  default:
262  throw std::runtime_error("Impossible condition when serializing");
263  }
264 
265  return;
266 }
267 
268 template<typename A>
269 HllSketchImpl<A>* CouponList<A>::couponUpdate(uint32_t coupon) {
270  for (size_t i = 0; i < coupons_.size(); ++i) { // search for empty slot
271  const uint32_t couponAtIdx = coupons_[i];
272  if (couponAtIdx == hll_constants::EMPTY) {
273  coupons_[i] = coupon; // the actual update
274  ++couponCount_;
275  if (couponCount_ == static_cast<uint32_t>(coupons_.size())) { // array full
276  if (this->lgConfigK_ < 8) {
277  return promoteHeapListOrSetToHll(*this);
278  }
279  return promoteHeapListToSet(*this);
280  }
281  return this;
282  }
283  // cell not empty
284  if (couponAtIdx == coupon) {
285  return this; // duplicate
286  }
287  // cell not empty and not a duplicate, continue
288  }
289  throw std::runtime_error("Array invalid: no empties and no duplicates");
290 }
291 
292 template<typename A>
293 double CouponList<A>::getCompositeEstimate() const { return getEstimate(); }
294 
295 template<typename A>
296 double CouponList<A>::getEstimate() const {
297  const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
298  return fmax(est, couponCount_);
299 }
300 
301 template<typename A>
302 double CouponList<A>::getLowerBound(uint8_t numStdDev) const {
303  HllUtil<A>::checkNumStdDev(numStdDev);
304  const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
305  const double tmp = est / (1.0 + (numStdDev * hll_constants::COUPON_RSE));
306  return fmax(tmp, couponCount_);
307 }
308 
309 template<typename A>
310 double CouponList<A>::getUpperBound(uint8_t numStdDev) const {
311  HllUtil<A>::checkNumStdDev(numStdDev);
312  const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
313  const double tmp = est / (1.0 - (numStdDev * hll_constants::COUPON_RSE));
314  return fmax(tmp, couponCount_);
315 }
316 
317 template<typename A>
318 bool CouponList<A>::isEmpty() const { return getCouponCount() == 0; }
319 
320 template<typename A>
321 uint32_t CouponList<A>::getUpdatableSerializationBytes() const {
322  return getMemDataStart() + static_cast<uint32_t>(coupons_.size()) * sizeof(uint32_t);
323 }
324 
325 template<typename A>
326 uint32_t CouponList<A>::getCouponCount() const {
327  return couponCount_;
328 }
329 
330 template<typename A>
331 uint32_t CouponList<A>::getCompactSerializationBytes() const {
332  return getMemDataStart() + (couponCount_ << 2);
333 }
334 
335 template<typename A>
336 uint32_t CouponList<A>::getMemDataStart() const {
337  return hll_constants::LIST_INT_ARR_START;
338 }
339 
340 template<typename A>
341 uint8_t CouponList<A>::getPreInts() const {
342  return hll_constants::LIST_PREINTS;
343 }
344 
345 template<typename A>
346 bool CouponList<A>::isCompact() const { return false; }
347 
348 template<typename A>
349 bool CouponList<A>::isOutOfOrderFlag() const { return oooFlag_; }
350 
351 template<typename A>
352 void CouponList<A>::putOutOfOrderFlag(bool oooFlag) {
353  oooFlag_ = oooFlag;
354 }
355 
356 template<typename A>
357 A CouponList<A>::getAllocator() const {
358  return coupons_.get_allocator();
359 }
360 
361 template<typename A>
362 HllSketchImpl<A>* CouponList<A>::promoteHeapListToSet(CouponList& list) {
363  return HllSketchImplFactory<A>::promoteListToSet(list);
364 }
365 
366 template<typename A>
367 HllSketchImpl<A>* CouponList<A>::promoteHeapListOrSetToHll(CouponList& src) {
368  return HllSketchImplFactory<A>::promoteListOrSetToHll(src);
369 }
370 
371 template<typename A>
372 coupon_iterator<A> CouponList<A>::begin(bool all) const {
373  return coupon_iterator<A>(coupons_.data(), coupons_.size(), 0, all);
374 }
375 
376 template<typename A>
377 coupon_iterator<A> CouponList<A>::end() const {
378  return coupon_iterator<A>(coupons_.data(), coupons_.size(), coupons_.size(), false);
379 }
380 
381 }
382 
383 #endif // _COUPONLIST_INTERNAL_HPP_
DataSketches namespace.
Definition: binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition: hll.hpp:72