datasketches-cpp
CouponHashSet-internal.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _COUPONHASHSET_INTERNAL_HPP_
21 #define _COUPONHASHSET_INTERNAL_HPP_
22 
23 #include "CouponHashSet.hpp"
24 
25 #include <cstring>
26 #include <exception>
27 #include <stdexcept>
28 
29 namespace datasketches {
30 
31 template<typename A>
32 static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon);
33 
34 template<typename A>
35 CouponHashSet<A>::CouponHashSet(uint8_t lgConfigK, target_hll_type tgtHllType, const A& allocator)
36  : CouponList<A>(lgConfigK, tgtHllType, hll_mode::SET, allocator)
37 {
38  if (lgConfigK <= 7) {
39  throw std::invalid_argument("CouponHashSet must be initialized with lgConfigK > 7. Found: "
40  + std::to_string(lgConfigK));
41  }
42 }
43 
44 template<typename A>
45 CouponHashSet<A>::CouponHashSet(const CouponHashSet<A>& that, const target_hll_type tgtHllType)
46  : CouponList<A>(that, tgtHllType) {}
47 
48 template<typename A>
49 std::function<void(HllSketchImpl<A>*)> CouponHashSet<A>::get_deleter() const {
50  return [](HllSketchImpl<A>* ptr) {
51  CouponHashSet<A>* chs = static_cast<CouponHashSet<A>*>(ptr);
52  ChsAlloc chsa(chs->getAllocator());
53  chs->~CouponHashSet();
54  chsa.deallocate(chs, 1);
55  };
56 }
57 
58 template<typename A>
59 CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const A& allocator) {
60  if (len < hll_constants::HASH_SET_INT_ARR_START) { // hard-coded
61  throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
62  }
63 
64  const uint8_t* data = static_cast<const uint8_t*>(bytes);
65  if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
66  throw std::invalid_argument("Incorrect number of preInts in input stream");
67  }
68  if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
69  throw std::invalid_argument("Wrong ser ver in input stream");
70  }
71  if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
72  throw std::invalid_argument("Input stream is not an HLL sketch");
73  }
74 
75  const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
76  if (mode != SET) {
77  throw std::invalid_argument("Calling set constructor with non-set mode data");
78  }
79 
80  const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
81 
82  const uint8_t lgK = data[hll_constants::LG_K_BYTE];
83  if (lgK <= 7) {
84  throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
85  + std::to_string(lgK));
86  }
87  uint8_t lgArrInts = data[hll_constants::LG_ARR_BYTE];
88  const bool compactFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
89 
90  uint32_t couponCount;
91  std::memcpy(&couponCount, data + hll_constants::HASH_SET_COUNT_INT, sizeof(couponCount));
92  if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
93  lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
94  }
95  // Don't set couponCount in sketch here;
96  // we'll set later if updatable, and increment with updates if compact
97  const uint32_t couponsInArray = (compactFlag ? couponCount : (1 << lgArrInts));
98  const size_t expectedLength = hll_constants::HASH_SET_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
99  if (len < expectedLength) {
100  throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
101  + ", found: " + std::to_string(len));
102  }
103 
104  ChsAlloc chsa(allocator);
105  CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
106 
107  if (compactFlag) {
108  const uint8_t* curPos = data + hll_constants::HASH_SET_INT_ARR_START;
109  uint32_t coupon;
110  for (uint32_t i = 0; i < couponCount; ++i, curPos += sizeof(coupon)) {
111  std::memcpy(&coupon, curPos, sizeof(coupon));
112  sketch->couponUpdate(coupon);
113  }
114  } else {
115  sketch->coupons_.resize(1ULL << lgArrInts);
116  sketch->couponCount_ = couponCount;
117  std::memcpy(sketch->coupons_.data(),
118  data + hll_constants::HASH_SET_INT_ARR_START,
119  couponsInArray * sizeof(uint32_t));
120  }
121 
122  return sketch;
123 }
124 
125 template<typename A>
126 CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator) {
127  uint8_t listHeader[8];
128  read(is, listHeader, 8 * sizeof(uint8_t));
129 
130  if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
131  throw std::invalid_argument("Incorrect number of preInts in input stream");
132  }
133  if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
134  throw std::invalid_argument("Wrong ser ver in input stream");
135  }
136  if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
137  throw std::invalid_argument("Input stream is not an HLL sketch");
138  }
139 
140  hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
141  if (mode != SET) {
142  throw std::invalid_argument("Calling set constructor with non-set mode data");
143  }
144 
145  const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
146 
147  const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
148  if (lgK <= 7) {
149  throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
150  + std::to_string(lgK));
151  }
152  uint8_t lgArrInts = listHeader[hll_constants::LG_ARR_BYTE];
153  const bool compactFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
154 
155  const auto couponCount = read<uint32_t>(is);
156  if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
157  lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
158  }
159 
160  ChsAlloc chsa(allocator);
161  CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
162  typedef std::unique_ptr<CouponHashSet<A>, std::function<void(HllSketchImpl<A>*)>> coupon_hash_set_ptr;
163  coupon_hash_set_ptr ptr(sketch, sketch->get_deleter());
164 
165  // Don't set couponCount here;
166  // we'll set later if updatable, and increment with updates if compact
167  if (compactFlag) {
168  for (uint32_t i = 0; i < couponCount; ++i) {
169  const auto coupon = read<uint32_t>(is);
170  sketch->couponUpdate(coupon);
171  }
172  } else {
173  sketch->coupons_.resize(1ULL << lgArrInts);
174  sketch->couponCount_ = couponCount;
175  // for stream processing, read entire list so read pointer ends up set correctly
176  read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t));
177  }
178 
179  if (!is.good())
180  throw std::runtime_error("error reading from std::istream");
181 
182  return ptr.release();
183 }
184 
185 template<typename A>
186 CouponHashSet<A>* CouponHashSet<A>::copy() const {
187  ChsAlloc chsa(this->coupons_.get_allocator());
188  return new (chsa.allocate(1)) CouponHashSet<A>(*this);
189 }
190 
191 template<typename A>
192 CouponHashSet<A>* CouponHashSet<A>::copyAs(target_hll_type tgtHllType) const {
193  ChsAlloc chsa(this->coupons_.get_allocator());
194  return new (chsa.allocate(1)) CouponHashSet<A>(*this, tgtHllType);
195 }
196 
197 template<typename A>
198 HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(uint32_t coupon) {
199  const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
200  const int32_t index = find<A>(this->coupons_.data(), lgCouponArrInts, coupon);
201  if (index >= 0) {
202  return this; // found duplicate, ignore
203  }
204  this->coupons_[~index] = coupon; // found empty
205  ++this->couponCount_;
206  if (checkGrowOrPromote()) {
207  return this->promoteHeapListOrSetToHll(*this);
208  }
209  return this;
210 }
211 
212 template<typename A>
213 uint32_t CouponHashSet<A>::getMemDataStart() const {
214  return hll_constants::HASH_SET_INT_ARR_START;
215 }
216 
217 template<typename A>
218 uint8_t CouponHashSet<A>::getPreInts() const {
219  return hll_constants::HASH_SET_PREINTS;
220 }
221 
222 template<typename A>
223 bool CouponHashSet<A>::checkGrowOrPromote() {
224  if (static_cast<size_t>(hll_constants::RESIZE_DENOM * this->couponCount_) > (hll_constants::RESIZE_NUMER * this->coupons_.size())) {
225  const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
226  if (lgCouponArrInts == (this->lgConfigK_ - 3)) { // at max size
227  return true; // promote to HLL
228  }
229  growHashSet(lgCouponArrInts + 1);
230  }
231  return false;
232 }
233 
234 template<typename A>
235 void CouponHashSet<A>::growHashSet(uint8_t tgtLgCoupArrSize) {
236  const uint32_t tgtLen = 1 << tgtLgCoupArrSize;
237  vector_int coupons_new(tgtLen, 0, this->coupons_.get_allocator());
238 
239  const uint32_t srcLen = static_cast<uint32_t>(this->coupons_.size());
240  for (uint32_t i = 0; i < srcLen; ++i) { // scan existing array for non-zero values
241  const uint32_t fetched = this->coupons_[i];
242  if (fetched != hll_constants::EMPTY) {
243  const int32_t idx = find<A>(coupons_new.data(), tgtLgCoupArrSize, fetched); // search TGT array
244  if (idx < 0) { // found EMPTY
245  coupons_new[~idx] = fetched; // insert
246  continue;
247  }
248  throw std::runtime_error("Error: Found duplicate coupon");
249  }
250  }
251  this->coupons_ = std::move(coupons_new);
252 }
253 
254 template<typename A>
255 static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon) {
256  const uint32_t arrMask = (1 << lgArrInts) - 1;
257  uint32_t probe = coupon & arrMask;
258  const uint32_t loopIndex = probe;
259  do {
260  const uint32_t couponAtIdx = array[probe];
261  if (couponAtIdx == hll_constants::EMPTY) {
262  return ~probe; //empty
263  }
264  else if (coupon == couponAtIdx) {
265  return probe; //duplicate
266  }
267  const uint32_t stride = ((coupon & hll_constants::KEY_MASK_26) >> lgArrInts) | 1;
268  probe = (probe + stride) & arrMask;
269  } while (probe != loopIndex);
270  throw std::invalid_argument("Key not found and no empty slots!");
271 }
272 
273 }
274 
275 #endif // _COUPONHASHSET_INTERNAL_HPP_
DataSketches namespace.
Definition: binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition: hll.hpp:72