datasketches-cpp
Loading...
Searching...
No Matches
CouponHashSet-internal.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef _COUPONHASHSET_INTERNAL_HPP_
21#define _COUPONHASHSET_INTERNAL_HPP_
22
23#include "CouponHashSet.hpp"
24
25#include <cstring>
26#include <exception>
27#include <stdexcept>
28
29namespace datasketches {
30
31template<typename A>
32static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon);
33
34template<typename A>
35CouponHashSet<A>::CouponHashSet(uint8_t lgConfigK, target_hll_type tgtHllType, const A& allocator)
36 : CouponList<A>(lgConfigK, tgtHllType, hll_mode::SET, allocator)
37{
38 if (lgConfigK <= 7) {
39 throw std::invalid_argument("CouponHashSet must be initialized with lgConfigK > 7. Found: "
40 + std::to_string(lgConfigK));
41 }
42}
43
44template<typename A>
45CouponHashSet<A>::CouponHashSet(const CouponHashSet<A>& that, const target_hll_type tgtHllType)
46 : CouponList<A>(that, tgtHllType) {}
47
48template<typename A>
49std::function<void(HllSketchImpl<A>*)> CouponHashSet<A>::get_deleter() const {
50 return [](HllSketchImpl<A>* ptr) {
51 CouponHashSet<A>* chs = static_cast<CouponHashSet<A>*>(ptr);
52 ChsAlloc chsa(chs->getAllocator());
53 chs->~CouponHashSet();
54 chsa.deallocate(chs, 1);
55 };
56}
57
58template<typename A>
59CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const A& allocator) {
60 if (len < hll_constants::HASH_SET_INT_ARR_START) { // hard-coded
61 throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
62 }
63
64 const uint8_t* data = static_cast<const uint8_t*>(bytes);
65 if (data[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
66 throw std::invalid_argument("Incorrect number of preInts in input stream");
67 }
68 if (data[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
69 throw std::invalid_argument("Wrong ser ver in input stream");
70 }
71 if (data[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
72 throw std::invalid_argument("Input stream is not an HLL sketch");
73 }
74
75 const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[hll_constants::MODE_BYTE]);
76 if (mode != SET) {
77 throw std::invalid_argument("Calling set constructor with non-set mode data");
78 }
79
80 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[hll_constants::MODE_BYTE]);
81
82 const uint8_t lgK = data[hll_constants::LG_K_BYTE];
83 if (lgK <= 7) {
84 throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
85 + std::to_string(lgK));
86 }
87 uint8_t lgArrInts = data[hll_constants::LG_ARR_BYTE];
88 const bool compactFlag = ((data[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
89
90 uint32_t couponCount;
91 std::memcpy(&couponCount, data + hll_constants::HASH_SET_COUNT_INT, sizeof(couponCount));
92 if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
93 lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
94 }
95 // Don't set couponCount in sketch here;
96 // we'll set later if updatable, and increment with updates if compact
97 const uint32_t couponsInArray = (compactFlag ? couponCount : (1 << lgArrInts));
98 const size_t expectedLength = hll_constants::HASH_SET_INT_ARR_START + (couponsInArray * sizeof(uint32_t));
99 if (len < expectedLength) {
100 throw std::out_of_range("Byte array too short for sketch. Expected " + std::to_string(expectedLength)
101 + ", found: " + std::to_string(len));
102 }
103
104 ChsAlloc chsa(allocator);
105 CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
106
107 if (compactFlag) {
108 const uint8_t* curPos = data + hll_constants::HASH_SET_INT_ARR_START;
109 uint32_t coupon;
110 for (uint32_t i = 0; i < couponCount; ++i, curPos += sizeof(coupon)) {
111 std::memcpy(&coupon, curPos, sizeof(coupon));
112 sketch->couponUpdate(coupon);
113 }
114 } else {
115 sketch->coupons_.resize(1ULL << lgArrInts);
116 sketch->couponCount_ = couponCount;
117 std::memcpy(sketch->coupons_.data(),
118 data + hll_constants::HASH_SET_INT_ARR_START,
119 couponsInArray * sizeof(uint32_t));
120 }
121
122 return sketch;
123}
124
125template<typename A>
126CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator) {
127 uint8_t listHeader[8];
128 read(is, listHeader, 8 * sizeof(uint8_t));
129
130 if (listHeader[hll_constants::PREAMBLE_INTS_BYTE] != hll_constants::HASH_SET_PREINTS) {
131 throw std::invalid_argument("Incorrect number of preInts in input stream");
132 }
133 if (listHeader[hll_constants::SER_VER_BYTE] != hll_constants::SER_VER) {
134 throw std::invalid_argument("Wrong ser ver in input stream");
135 }
136 if (listHeader[hll_constants::FAMILY_BYTE] != hll_constants::FAMILY_ID) {
137 throw std::invalid_argument("Input stream is not an HLL sketch");
138 }
139
140 hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
141 if (mode != SET) {
142 throw std::invalid_argument("Calling set constructor with non-set mode data");
143 }
144
145 const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
146
147 const uint8_t lgK = listHeader[hll_constants::LG_K_BYTE];
148 if (lgK <= 7) {
149 throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: "
150 + std::to_string(lgK));
151 }
152 uint8_t lgArrInts = listHeader[hll_constants::LG_ARR_BYTE];
153 const bool compactFlag = ((listHeader[hll_constants::FLAGS_BYTE] & hll_constants::COMPACT_FLAG_MASK) ? true : false);
154
155 const auto couponCount = read<uint32_t>(is);
156 if (lgArrInts < hll_constants::LG_INIT_SET_SIZE) {
157 lgArrInts = HllUtil<>::computeLgArrInts(SET, couponCount, lgK);
158 }
159
160 ChsAlloc chsa(allocator);
161 CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
162 typedef std::unique_ptr<CouponHashSet<A>, std::function<void(HllSketchImpl<A>*)>> coupon_hash_set_ptr;
163 coupon_hash_set_ptr ptr(sketch, sketch->get_deleter());
164
165 // Don't set couponCount here;
166 // we'll set later if updatable, and increment with updates if compact
167 if (compactFlag) {
168 for (uint32_t i = 0; i < couponCount; ++i) {
169 const auto coupon = read<uint32_t>(is);
170 sketch->couponUpdate(coupon);
171 }
172 } else {
173 sketch->coupons_.resize(1ULL << lgArrInts);
174 sketch->couponCount_ = couponCount;
175 // for stream processing, read entire list so read pointer ends up set correctly
176 read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t));
177 }
178
179 if (!is.good())
180 throw std::runtime_error("error reading from std::istream");
181
182 return ptr.release();
183}
184
185template<typename A>
186CouponHashSet<A>* CouponHashSet<A>::copy() const {
187 ChsAlloc chsa(this->coupons_.get_allocator());
188 return new (chsa.allocate(1)) CouponHashSet<A>(*this);
189}
190
191template<typename A>
192CouponHashSet<A>* CouponHashSet<A>::copyAs(target_hll_type tgtHllType) const {
193 ChsAlloc chsa(this->coupons_.get_allocator());
194 return new (chsa.allocate(1)) CouponHashSet<A>(*this, tgtHllType);
195}
196
197template<typename A>
198HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(uint32_t coupon) {
199 const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
200 const int32_t index = find<A>(this->coupons_.data(), lgCouponArrInts, coupon);
201 if (index >= 0) {
202 return this; // found duplicate, ignore
203 }
204 this->coupons_[~index] = coupon; // found empty
205 ++this->couponCount_;
206 if (checkGrowOrPromote()) {
207 return this->promoteHeapListOrSetToHll(*this);
208 }
209 return this;
210}
211
212template<typename A>
213uint32_t CouponHashSet<A>::getMemDataStart() const {
214 return hll_constants::HASH_SET_INT_ARR_START;
215}
216
217template<typename A>
218uint8_t CouponHashSet<A>::getPreInts() const {
219 return hll_constants::HASH_SET_PREINTS;
220}
221
222template<typename A>
223bool CouponHashSet<A>::checkGrowOrPromote() {
224 if (static_cast<size_t>(hll_constants::RESIZE_DENOM * this->couponCount_) > (hll_constants::RESIZE_NUMER * this->coupons_.size())) {
225 const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(static_cast<uint32_t>(this->coupons_.size()));
226 if (lgCouponArrInts == (this->lgConfigK_ - 3)) { // at max size
227 return true; // promote to HLL
228 }
229 growHashSet(lgCouponArrInts + 1);
230 }
231 return false;
232}
233
234template<typename A>
235void CouponHashSet<A>::growHashSet(uint8_t tgtLgCoupArrSize) {
236 const uint32_t tgtLen = 1 << tgtLgCoupArrSize;
237 vector_int coupons_new(tgtLen, 0, this->coupons_.get_allocator());
238
239 const uint32_t srcLen = static_cast<uint32_t>(this->coupons_.size());
240 for (uint32_t i = 0; i < srcLen; ++i) { // scan existing array for non-zero values
241 const uint32_t fetched = this->coupons_[i];
242 if (fetched != hll_constants::EMPTY) {
243 const int32_t idx = find<A>(coupons_new.data(), tgtLgCoupArrSize, fetched); // search TGT array
244 if (idx < 0) { // found EMPTY
245 coupons_new[~idx] = fetched; // insert
246 continue;
247 }
248 throw std::runtime_error("Error: Found duplicate coupon");
249 }
250 }
251 this->coupons_ = std::move(coupons_new);
252}
253
254template<typename A>
255static int32_t find(const uint32_t* array, uint8_t lgArrInts, uint32_t coupon) {
256 const uint32_t arrMask = (1 << lgArrInts) - 1;
257 uint32_t probe = coupon & arrMask;
258 const uint32_t loopIndex = probe;
259 do {
260 const uint32_t couponAtIdx = array[probe];
261 if (couponAtIdx == hll_constants::EMPTY) {
262 return ~probe; //empty
263 }
264 else if (coupon == couponAtIdx) {
265 return probe; //duplicate
266 }
267 const uint32_t stride = ((coupon & hll_constants::KEY_MASK_26) >> lgArrInts) | 1;
268 probe = (probe + stride) & arrMask;
269 } while (probe != loopIndex);
270 throw std::invalid_argument("Key not found and no empty slots!");
271}
272
273}
274
275#endif // _COUPONHASHSET_INTERNAL_HPP_
DataSketches namespace.
Definition binomial_bounds.hpp:38
target_hll_type
Specifies the target type of HLL sketch to be created.
Definition hll.hpp:72