datasketches-cpp
Loading...
Searching...
No Matches
CouponList-internal.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _COUPONLIST_INTERNAL_HPP_
21#define _COUPONLIST_INTERNAL_HPP_
22
23#include "CouponList.hpp"
24#include "CubicInterpolation.hpp"
25#include "HllUtil.hpp"
26#include "count_zeros.hpp"
27
28#include <algorithm>
29#include <cmath>
30#include <stdexcept>
31
32namespace datasketches {
33
34template<typename A>
35CouponList<A>::CouponList(uint8_t lgConfigK, target_hll_type tgtHllType, hll_mode mode, const A& allocator):
36HllSketchImpl<A>(lgConfigK, tgtHllType, mode, false),
37couponCount_(0),
38oooFlag_(false),
39coupons_(1ULL << (mode == hll_mode::LIST ? hll_constants::LG_INIT_LIST_SIZE : hll_constants::LG_INIT_SET_SIZE), 0, allocator)
40{}
41
42template<typename A>
43CouponList<A>::CouponList(const CouponList& that, const target_hll_type tgtHllType):
44HllSketchImpl<A>(that.lgConfigK_, tgtHllType, that.mode_, false),
45couponCount_(that.couponCount_),
46oooFlag_(that.oooFlag_),
47coupons_(that.coupons_)
48{}
49
50template<typename A>
51std::function<void(HllSketchImpl<A>*)> CouponList<A>::get_deleter() const {
52 return [](HllSketchImpl<A>* ptr) {
53 CouponList<A>* cl = static_cast<CouponList<A>*>(ptr);
54 ClAlloc cla(cl->getAllocator());
55 cl->~CouponList();
56 cla.deallocate(cl, 1);
57 };
58}
59
60template<typename A>
61CouponList<A>* CouponList<A>::copy() const {
62 ClAlloc cla(coupons_.get_allocator());
63 return new (cla.allocate(1)) CouponList<A>(*this);
64}
65
66template<typename A>
67CouponList<A>* CouponList<A>::copyAs(target_hll_type tgtHllType) const {
68 ClAlloc cla(coupons_.get_allocator());
69 return new (cla.allocate(1)) CouponList<A>(*this, tgtHllType);
70}
71
72template<typename A>
73CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len, const A& allocator) {
74 if (len < hll_constants::LIST_INT_ARR_START) {
75 throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
76 }
77
78 const uint8_t* data = static_cast<const uint8_t*>(bytes);
79 if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
80 throw std::invalid_argument("Incorrect number of preInts in input stream");
81 }
82 if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
83 throw std::invalid_argument("Wrong ser ver in input stream");
84 }
85 if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
86 throw std::invalid_argument("Input stream is not an HLL sketch");
87 }
88
89 hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
90 if (mode != LIST) {
91 throw std::invalid_argument("Calling list constructor with non-list mode data");
92 }
93
94 target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
95
96 const uint8_t lgK = data[hll_constants::LG_K_BYTE];
97 const bool compact = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
98 const bool oooFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
99 const bool emptyFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
100
101 const uint32_t couponCount = data[hll_constants::LIST_COUNT_BYTE];
102 const uint32_t couponsInArray = (compact ? couponCount : (1 << HllUtil<A>::computeLgArrInts(LIST, couponCount, lgK)));
103 const size_t expectedLength = hll_constants::LIST_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
104 if (len < expectedLength) {
105 throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
106 + ", found: " + std::to_string(len));
107 }
108
109 ClAlloc cla(allocator);
110 CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
111 sketch->couponCount_ = couponCount;
112 sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
113
114 if (!emptyFlag) {
115 // only need to read valid coupons, unlike in stream case
116 std::memcpy(sketch->coupons_.data(), data + hll_constants::LIST_INT_ARR_START, couponCount * sizeof(uint32_t));
117 }
118
119 return sketch;
120}
121
122template<typename A>
123CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
124 uint8_t listHeader[8];
125 read(is, listHeader, 8 * sizeof(uint8_t));
126
127 if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
128 throw std::invalid_argument("Incorrect number of preInts in input stream");
129 }
130 if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
131 throw std::invalid_argument("Wrong ser ver in input stream");
132 }
133 if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
134 throw std::invalid_argument("Input stream is not an HLL sketch");
135 }
136
137 hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
138 if (mode != LIST) {
139 throw std::invalid_argument("Calling list constructor with non-list mode data");
140 }
141
142 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
143
144 const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
145 const bool compact = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
146 const bool oooFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
147 const bool emptyFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
148
149 ClAlloc cla(allocator);
150 CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
151 using coupon_list_ptr = std::unique_ptr<CouponList<A>, std::function<void(HllSketchImpl<A>*)>>;
152 coupon_list_ptr ptr(sketch, sketch->get_deleter());
153 const uint32_t couponCount = listHeader[hll_constants::LIST_COUNT_BYTE];
154 sketch->couponCount_ = couponCount;
155 sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
156
157 if (!emptyFlag) {
158 // For stream processing, need to read entire number written to stream so read
159 // pointer ends up set correctly.
160 // If not compact, still need to read empty items even though in order.
161 const uint32_t numToRead = (compact ? couponCount : static_cast<uint32_t>(sketch->coupons_.size()));
162 read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t));
163 }
164
165 if (!is.good())
166 throw std::runtime_error("error reading from std::istream");
167
168 return ptr.release();
169}
170
171template<typename A>
172auto CouponList<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
173 const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
174 vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
175 uint8_t* bytes = byteArr.data() + header_size_bytes;
176
177 bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
178 bytes[hll_constants::SER_VER_BYTE] = static_cast<uint8_t>(hll_constants::SER_VER);
179 bytes[hll_constants::FAMILY_BYTE] = static_cast<uint8_t>(hll_constants::FAMILY_ID);
180 bytes[hll_constants::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK_);
181 bytes[hll_constants::LG_ARR_BYTE] = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
182 bytes[hll_constants::FLAGS_BYTE] = this->makeFlagsByte(compact);
183 bytes[hll_constants::LIST_COUNT_BYTE] = static_cast<uint8_t>(this->mode_ == LIST ? couponCount_ : 0);
184 bytes[hll_constants::MODE_BYTE] = this->makeModeByte();
185
186 if (this->mode_ == SET) {
187 std::memcpy(bytes + hll_constants::HASH_SET_COUNT_INT, &couponCount_, sizeof(couponCount_));
188 }
189
190 // coupons
191 // isCompact() is always false for now
192 const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
193 switch (sw) {
194 case 0: { // src updatable, dst updatable
195 std::memcpy(bytes + getMemDataStart(), coupons_.data(), coupons_.size() * sizeof(uint32_t));
196 break;
197 }
198 case 1: { // src updatable, dst compact
199 bytes += getMemDataStart(); // reusing pointer for incremental writes
200 for (const uint32_t coupon: *this) {
201 std::memcpy(bytes, &coupon, sizeof(coupon));
202 bytes += sizeof(coupon);
203 }
204 break;
205 }
206
207 default:
208 throw std::runtime_error("Impossible condition when serializing");
209 }
210
211 return byteArr;
212}
213
214template<typename A>
215void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
216 // header
217 const uint8_t preInts = getPreInts();
218 write(os, preInts);
219 const uint8_t serialVersion(hll_constants::SER_VER);
220 write(os, serialVersion);
221 const uint8_t familyId(hll_constants::FAMILY_ID);
222 write(os, familyId);
223 const uint8_t lgKByte = this->lgConfigK_;
224 write(os, lgKByte);
225 const uint8_t lgArrIntsByte = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
226 write(os, lgArrIntsByte);
227 const uint8_t flagsByte = this->makeFlagsByte(compact);
228 write(os, flagsByte);
229
230 if (this->mode_ == LIST) {
231 const uint8_t listCount = static_cast<uint8_t>(couponCount_);
232 write(os, listCount);
233 } else { // mode == SET
234 const uint8_t unused = 0;
235 write(os, unused);
236 }
237
238 const uint8_t modeByte = this->makeModeByte();
239 write(os, modeByte);
240
241 if (this->mode_ == SET) {
242 // writing as int, already stored as int
243 write(os, couponCount_);
244 }
245
246 // coupons
247 // isCompact() is always false for now
248 const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
249 switch (sw) {
250 case 0: { // src updatable, dst updatable
251 write(os, coupons_.data(), coupons_.size() * sizeof(uint32_t));
252 break;
253 }
254 case 1: { // src updatable, dst compact
255 for (const uint32_t coupon: *this) {
256 write(os, coupon);
257 }
258 break;
259 }
260
261 default:
262 throw std::runtime_error("Impossible condition when serializing");
263 }
264
265 return;
266}
267
268template<typename A>
269HllSketchImpl<A>* CouponList<A>::couponUpdate(uint32_t coupon) {
270 for (size_t i = 0; i < coupons_.size(); ++i) { // search for empty slot
271 const uint32_t couponAtIdx = coupons_[i];
272 if (couponAtIdx == hll_constants::EMPTY) {
273 coupons_[i] = coupon; // the actual update
274 ++couponCount_;
275 if (couponCount_ == static_cast<uint32_t>(coupons_.size())) { // array full
276 if (this->lgConfigK_ < 8) {
277 return promoteHeapListOrSetToHll(*this);
278 }
279 return promoteHeapListToSet(*this);
280 }
281 return this;
282 }
283 // cell not empty
284 if (couponAtIdx == coupon) {
285 return this; // duplicate
286 }
287 // cell not empty and not a duplicate, continue
288 }
289 throw std::runtime_error("Array invalid: no empties and no duplicates");
290}
291
292template<typename A>
293double CouponList<A>::getCompositeEstimate() const { return getEstimate(); }
294
295template<typename A>
296double CouponList<A>::getEstimate() const {
297 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
298 return fmax(est, couponCount_);
299}
300
301template<typename A>
302double CouponList<A>::getLowerBound(uint8_t numStdDev) const {
303 HllUtil<A>::checkNumStdDev(numStdDev);
304 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
305 const double tmp = est / (1.0 + (numStdDev * hll_constants::COUPON_RSE));
306 return fmax(tmp, couponCount_);
307}
308
309template<typename A>
310double CouponList<A>::getUpperBound(uint8_t numStdDev) const {
311 HllUtil<A>::checkNumStdDev(numStdDev);
312 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
313 const double tmp = est / (1.0 - (numStdDev * hll_constants::COUPON_RSE));
314 return fmax(tmp, couponCount_);
315}
316
317template<typename A>
318bool CouponList<A>::isEmpty() const { return getCouponCount() == 0; }
319
320template<typename A>
321uint32_t CouponList<A>::getUpdatableSerializationBytes() const {
322 return getMemDataStart() + static_cast<uint32_t>(coupons_.size()) * sizeof(uint32_t);
323}
324
325template<typename A>
326uint32_t CouponList<A>::getCouponCount() const {
327 return couponCount_;
328}
329
330template<typename A>
331uint32_t CouponList<A>::getCompactSerializationBytes() const {
332 return getMemDataStart() + (couponCount_ << 2);
333}
334
335template<typename A>
336uint32_t CouponList<A>::getMemDataStart() const {
337 return hll_constants::LIST_INT_ARR_START;
338}
339
340template<typename A>
341uint8_t CouponList<A>::getPreInts() const {
342 return hll_constants::LIST_PREINTS;
343}
344
345template<typename A>
346bool CouponList<A>::isCompact() const { return false; }
347
348template<typename A>
349bool CouponList<A>::isOutOfOrderFlag() const { return oooFlag_; }
350
351template<typename A>
352void CouponList<A>::putOutOfOrderFlag(bool oooFlag) {
353 oooFlag_ = oooFlag;
354}
355
356template<typename A>
357A CouponList<A>::getAllocator() const {
358 return coupons_.get_allocator();
359}
360
361template<typename A>
362HllSketchImpl<A>* CouponList<A>::promoteHeapListToSet(CouponList& list) {
363 return HllSketchImplFactory<A>::promoteListToSet(list);
364}
365
366template<typename A>
367HllSketchImpl<A>* CouponList<A>::promoteHeapListOrSetToHll(CouponList& src) {
368 return HllSketchImplFactory<A>::promoteListOrSetToHll(src);
369}
370
371template<typename A>
372coupon_iterator<A> CouponList<A>::begin(bool all) const {
373 return coupon_iterator<A>(coupons_.data(), coupons_.size(), 0, all);
374}
375
376template<typename A>
377coupon_iterator<A> CouponList<A>::end() const {
378 return coupon_iterator<A>(coupons_.data(), coupons_.size(), coupons_.size(), false);
379}
380
381}
382
383#endif // _COUPONLIST_INTERNAL_HPP_
DataSketches namespace.
Definition binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition hll.hpp:72