datasketches-cpp
Loading...
Searching...
No Matches
CouponList-internal.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _COUPONLIST_INTERNAL_HPP_
21#define _COUPONLIST_INTERNAL_HPP_
22
23#include "CouponList.hpp"
24#include "CubicInterpolation.hpp"
25#include "HllUtil.hpp"
26#include "count_zeros.hpp"
27
28#include <algorithm>
29#include <cmath>
30#include <stdexcept>
31
32namespace datasketches {
33
34template<typename A>
35CouponList<A>::CouponList(uint8_t lgConfigK, target_hll_type tgtHllType, hll_mode mode, const A& allocator):
36HllSketchImpl<A>(lgConfigK, tgtHllType, mode, false),
37couponCount_(0),
38oooFlag_(false),
39coupons_(1ULL << (mode == hll_mode::LIST ? hll_constants::LG_INIT_LIST_SIZE : hll_constants::LG_INIT_SET_SIZE), 0, allocator)
40{}
41
42template<typename A>
43CouponList<A>::CouponList(const CouponList& that, const target_hll_type tgtHllType):
44HllSketchImpl<A>(that.lgConfigK_, tgtHllType, that.mode_, false),
45couponCount_(that.couponCount_),
46oooFlag_(that.oooFlag_),
47coupons_(that.coupons_)
48{}
49
50template<typename A>
51std::function<void(HllSketchImpl<A>*)> CouponList<A>::get_deleter() const {
52 return [](HllSketchImpl<A>* ptr) {
53 CouponList<A>* cl = static_cast<CouponList<A>*>(ptr);
54 ClAlloc cla(cl->getAllocator());
55 cl->~CouponList();
56 cla.deallocate(cl, 1);
57 };
58}
59
60template<typename A>
61CouponList<A>* CouponList<A>::copy() const {
62 ClAlloc cla(coupons_.get_allocator());
63 return new (cla.allocate(1)) CouponList<A>(*this);
64}
65
66template<typename A>
67CouponList<A>* CouponList<A>::copyAs(target_hll_type tgtHllType) const {
68 ClAlloc cla(coupons_.get_allocator());
69 return new (cla.allocate(1)) CouponList<A>(*this, tgtHllType);
70}
71
72template<typename A>
73CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len, const A& allocator) {
74 if (len < hll_constants::LIST_INT_ARR_START) {
75 throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
76 }
77
78 const uint8_t* data = static_cast<const uint8_t*>(bytes);
79 if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
80 throw std::invalid_argument("Incorrect number of preInts in input stream");
81 }
82 if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
83 throw std::invalid_argument("Wrong ser ver in input stream");
84 }
85 if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
86 throw std::invalid_argument("Input stream is not an HLL sketch");
87 }
88
89 hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
90 if (mode != LIST) {
91 throw std::invalid_argument("Calling list constructor with non-list mode data");
92 }
93
94 target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
95
96 const uint8_t lgK = data[hll_constants::LG_K_BYTE];
97 const bool compact = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
98 const bool oooFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
99 const bool emptyFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
100
101 const uint32_t couponCount = data[hll_constants::LIST_COUNT_BYTE];
102 const uint32_t couponsInArray = (compact ? couponCount : (1 << HllUtil<A>::computeLgArrInts(LIST, couponCount, lgK)));
103 const size_t expectedLength = hll_constants::LIST_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
104 if (len < expectedLength) {
105 throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
106 + ", found: " + std::to_string(len));
107 }
108
109 ClAlloc cla(allocator);
110 CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
111 sketch->couponCount_ = couponCount;
112 sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
113
114 if (!emptyFlag) {
115 // only need to read valid coupons, unlike in stream case
116 std::memcpy(sketch->coupons_.data(), data + hll_constants::LIST_INT_ARR_START, couponCount * sizeof(uint32_t));
117 }
118
119 return sketch;
120}
121
122template<typename A>
123CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
124 uint8_t listHeader[8];
125 read(is, listHeader, 8 * sizeof(uint8_t));
126
127 if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::LIST_PREINTS) {
128 throw std::invalid_argument("Incorrect number of preInts in input stream");
129 }
130 if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
131 throw std::invalid_argument("Wrong ser ver in input stream");
132 }
133 if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
134 throw std::invalid_argument("Input stream is not an HLL sketch");
135 }
136
137 hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
138 if (mode != LIST) {
139 throw std::invalid_argument("Calling list constructor with non-list mode data");
140 }
141
142 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
143
144 const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
145 const bool compact = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
146 const bool oooFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::OUT_OF_ORDER_FLAG_MASK) ? true : false);
147 const bool emptyFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::EMPTY_FLAG_MASK) ? true : false);
148
149 ClAlloc cla(allocator);
150 CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
151 using coupon_list_ptr = std::unique_ptr<CouponList<A>, std::function<void(HllSketchImpl<A>*)>>;
152 coupon_list_ptr ptr(sketch, sketch->get_deleter());
153 const uint32_t couponCount = listHeader[hll_constants::LIST_COUNT_BYTE];
154 sketch->couponCount_ = couponCount;
155 sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
156
157 if (!emptyFlag) {
158 // For stream processing, need to read entire number written to stream so read
159 // pointer ends up set correctly.
160 // If not compact, still need to read empty items even though in order.
161 const uint32_t numToRead = (compact ? couponCount : static_cast<uint32_t>(sketch->coupons_.size()));
162 read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t));
163 }
164
165 if (!is.good()) { throw std::runtime_error("error reading from std::istream"); }
166
167 return ptr.release();
168}
169
170template<typename A>
171auto CouponList<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
172 const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
173 vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
174 uint8_t* bytes = byteArr.data() + header_size_bytes;
175
176 bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
177 bytes[hll_constants::SER_VER_BYTE] = static_cast<uint8_t>(hll_constants::SER_VER);
178 bytes[hll_constants::FAMILY_BYTE] = static_cast<uint8_t>(hll_constants::FAMILY_ID);
179 bytes[hll_constants::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK_);
180 bytes[hll_constants::LG_ARR_BYTE] = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
181 bytes[hll_constants::FLAGS_BYTE] = this->makeFlagsByte(compact);
182 bytes[hll_constants::LIST_COUNT_BYTE] = static_cast<uint8_t>(this->mode_ == LIST ? couponCount_ : 0);
183 bytes[hll_constants::MODE_BYTE] = this->makeModeByte();
184
185 if (this->mode_ == SET) {
186 std::memcpy(bytes + hll_constants::HASH_SET_COUNT_INT, &couponCount_, sizeof(couponCount_));
187 }
188
189 // coupons
190 // isCompact() is always false for now
191 const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
192 switch (sw) {
193 case 0: { // src updatable, dst updatable
194 std::memcpy(bytes + getMemDataStart(), coupons_.data(), coupons_.size() * sizeof(uint32_t));
195 break;
196 }
197 case 1: { // src updatable, dst compact
198 bytes += getMemDataStart(); // reusing pointer for incremental writes
199 for (const uint32_t coupon: *this) {
200 std::memcpy(bytes, &coupon, sizeof(coupon));
201 bytes += sizeof(coupon);
202 }
203 break;
204 }
205
206 default:
207 throw std::runtime_error("Impossible condition when serializing");
208 }
209
210 return byteArr;
211}
212
213template<typename A>
214void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
215 // header
216 const uint8_t preInts = getPreInts();
217 write(os, preInts);
218 const uint8_t serialVersion(hll_constants::SER_VER);
219 write(os, serialVersion);
220 const uint8_t familyId(hll_constants::FAMILY_ID);
221 write(os, familyId);
222 const uint8_t lgKByte = this->lgConfigK_;
223 write(os, lgKByte);
224 const uint8_t lgArrIntsByte = count_trailing_zeros_in_u32(static_cast<uint32_t>(coupons_.size()));
225 write(os, lgArrIntsByte);
226 const uint8_t flagsByte = this->makeFlagsByte(compact);
227 write(os, flagsByte);
228
229 if (this->mode_ == LIST) {
230 const uint8_t listCount = static_cast<uint8_t>(couponCount_);
231 write(os, listCount);
232 } else { // mode == SET
233 const uint8_t unused = 0;
234 write(os, unused);
235 }
236
237 const uint8_t modeByte = this->makeModeByte();
238 write(os, modeByte);
239
240 if (this->mode_ == SET) {
241 // writing as int, already stored as int
242 write(os, couponCount_);
243 }
244
245 // coupons
246 // isCompact() is always false for now
247 const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
248 switch (sw) {
249 case 0: { // src updatable, dst updatable
250 write(os, coupons_.data(), coupons_.size() * sizeof(uint32_t));
251 break;
252 }
253 case 1: { // src updatable, dst compact
254 for (const uint32_t coupon: *this) {
255 write(os, coupon);
256 }
257 break;
258 }
259
260 default:
261 throw std::runtime_error("Impossible condition when serializing");
262 }
263
264 return;
265}
266
267template<typename A>
268HllSketchImpl<A>* CouponList<A>::couponUpdate(uint32_t coupon) {
269 for (size_t i = 0; i < coupons_.size(); ++i) { // search for empty slot
270 const uint32_t couponAtIdx = coupons_[i];
271 if (couponAtIdx == hll_constants::EMPTY) {
272 coupons_[i] = coupon; // the actual update
273 ++couponCount_;
274 if (couponCount_ == static_cast<uint32_t>(coupons_.size())) { // array full
275 if (this->lgConfigK_ < 8) {
276 return promoteHeapListOrSetToHll(*this);
277 }
278 return promoteHeapListToSet(*this);
279 }
280 return this;
281 }
282 // cell not empty
283 if (couponAtIdx == coupon) {
284 return this; // duplicate
285 }
286 // cell not empty and not a duplicate, continue
287 }
288 throw std::runtime_error("Array invalid: no empties and no duplicates");
289}
290
291template<typename A>
292double CouponList<A>::getCompositeEstimate() const { return getEstimate(); }
293
294template<typename A>
295double CouponList<A>::getEstimate() const {
296 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
297 return fmax(est, couponCount_);
298}
299
300template<typename A>
301double CouponList<A>::getLowerBound(uint8_t numStdDev) const {
302 HllUtil<A>::checkNumStdDev(numStdDev);
303 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
304 const double tmp = est / (1.0 + (numStdDev * hll_constants::COUPON_RSE));
305 return fmax(tmp, couponCount_);
306}
307
308template<typename A>
309double CouponList<A>::getUpperBound(uint8_t numStdDev) const {
310 HllUtil<A>::checkNumStdDev(numStdDev);
311 const double est = CubicInterpolation<A>::usingXAndYTables(couponCount_);
312 const double tmp = est / (1.0 - (numStdDev * hll_constants::COUPON_RSE));
313 return fmax(tmp, couponCount_);
314}
315
316template<typename A>
317bool CouponList<A>::isEmpty() const { return getCouponCount() == 0; }
318
319template<typename A>
320uint32_t CouponList<A>::getUpdatableSerializationBytes() const {
321 return getMemDataStart() + static_cast<uint32_t>(coupons_.size()) * sizeof(uint32_t);
322}
323
324template<typename A>
325uint32_t CouponList<A>::getCouponCount() const {
326 return couponCount_;
327}
328
329template<typename A>
330uint32_t CouponList<A>::getCompactSerializationBytes() const {
331 return getMemDataStart() + (couponCount_ << 2);
332}
333
334template<typename A>
335uint32_t CouponList<A>::getMemDataStart() const {
336 return hll_constants::LIST_INT_ARR_START;
337}
338
339template<typename A>
340uint8_t CouponList<A>::getPreInts() const {
341 return hll_constants::LIST_PREINTS;
342}
343
344template<typename A>
345bool CouponList<A>::isCompact() const { return false; }
346
347template<typename A>
348bool CouponList<A>::isOutOfOrderFlag() const { return oooFlag_; }
349
350template<typename A>
351void CouponList<A>::putOutOfOrderFlag(bool oooFlag) {
352 oooFlag_ = oooFlag;
353}
354
355template<typename A>
356A CouponList<A>::getAllocator() const {
357 return coupons_.get_allocator();
358}
359
360template<typename A>
361HllSketchImpl<A>* CouponList<A>::promoteHeapListToSet(CouponList& list) {
362 return HllSketchImplFactory<A>::promoteListToSet(list);
363}
364
365template<typename A>
366HllSketchImpl<A>* CouponList<A>::promoteHeapListOrSetToHll(CouponList& src) {
367 return HllSketchImplFactory<A>::promoteListOrSetToHll(src);
368}
369
370template<typename A>
371coupon_iterator<A> CouponList<A>::begin(bool all) const {
372 return coupon_iterator<A>(coupons_.data(), coupons_.size(), 0, all);
373}
374
375template<typename A>
376coupon_iterator<A> CouponList<A>::end() const {
377 return coupon_iterator<A>(coupons_.data(), coupons_.size(), coupons_.size(), false);
378}
379
380}
381
382#endif // _COUPONLIST_INTERNAL_HPP_
DataSketches namespace.
Definition binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition hll.hpp:72