20 #ifndef DENSITY_SKETCH_IMPL_HPP_
21 #define DENSITY_SKETCH_IMPL_HPP_
26 #include "memory_operations.hpp"
27 #include "conditional_forward.hpp"
31 template<
typename T,
typename K,
typename A>
38 levels_(1, Level(allocator), allocator)
43 template<
typename T,
typename K,
typename A>
45 Levels&& levels,
const K& kernel):
49 num_retained_(num_retained),
51 levels_(std::move(levels))
56 template<
typename T,
typename K,
typename A>
61 template<
typename T,
typename K,
typename A>
66 template<
typename T,
typename K,
typename A>
68 return num_retained_ == 0;
71 template<
typename T,
typename K,
typename A>
76 template<
typename T,
typename K,
typename A>
81 template<
typename T,
typename K,
typename A>
83 return levels_.size() > 1;
86 template<
typename T,
typename K,
typename A>
87 template<
typename FwdVector>
89 if (point.size() != dim_)
throw std::invalid_argument(
"dimension mismatch");
90 while (num_retained_ >= k_ * levels_.size()) compact();
91 levels_[0].push_back(std::forward<FwdVector>(point));
96 template<
typename T,
typename K,
typename A>
97 template<
typename FwdSketch>
99 if (other.is_empty())
return;
100 if (other.dim_ != dim_)
throw std::invalid_argument(
"dimension mismatch");
101 while (levels_.size() < other.levels_.size()) levels_.push_back(Level(levels_.get_allocator()));
102 for (
unsigned height = 0; height < other.levels_.size(); ++height) {
104 forward_begin(conditional_forward<FwdSketch>(other.levels_[height])),
105 forward_end(conditional_forward<FwdSketch>(other.levels_[height])),
106 back_inserter(levels_[height])
109 num_retained_ += other.num_retained_;
111 while (num_retained_ >= k_ * levels_.size()) compact();
114 template<
typename T,
typename K,
typename A>
116 if (is_empty())
throw std::runtime_error(
"operation is undefined for an empty sketch");
118 for (
unsigned height = 0; height < levels_.size(); ++height) {
119 for (
const auto& p: levels_[height]) {
120 density += (1 << height) * kernel_(p, point) / n_;
126 template<
typename T,
typename K,
typename A>
131 template<
typename T,
typename K,
typename A>
133 for (
unsigned height = 0; height < levels_.size(); ++height) {
134 if (levels_[height].size() >= k_) {
135 if (height + 1 >= levels_.size()) levels_.push_back(Level(levels_.get_allocator()));
136 compact_level(height);
142 template<
typename T,
typename K,
typename A>
144 auto& level = levels_[height];
145 std::vector<bool> bits(level.size());
146 bits[0] = random_utils::random_bit();
147 std::shuffle(level.begin(), level.end(), random_utils::rand);
148 for (
unsigned i = 1; i < level.size(); ++i) {
150 for (
unsigned j = 0; j < i; ++j) {
151 delta += (bits[j] ? 1 : -1) * kernel_(level[i], level[j]);
155 for (
unsigned i = 0; i < level.size(); ++i) {
157 levels_[height + 1].push_back(std::move(level[i]));
183 template<
typename T,
typename K,
typename A>
185 const uint8_t preamble_ints = is_empty() ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_LONG;
186 write(os, preamble_ints);
187 const uint8_t ser_ver = SERIAL_VERSION;
189 const uint8_t family = FAMILY_ID;
193 const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
194 write(os, flags_byte);
196 const uint16_t unused = 0;
203 write(os, num_retained_);
207 size_t pt_size =
sizeof(T) * dim_;
208 for (
const Level& lvl : levels_) {
209 const uint32_t level_size =
static_cast<uint32_t
>(lvl.size());
210 write(os, level_size);
211 for (
const Vector& pt : lvl) {
212 write(os, pt.data(), pt_size);
217 template<
typename T,
typename K,
typename A>
219 const uint8_t preamble_ints = (is_empty() ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_LONG);
222 size_t size = header_size_bytes + preamble_ints *
sizeof(uint32_t);
224 for (
const Level& lvl : levels_)
225 size +=
sizeof(uint32_t) + (lvl.size() * dim_ *
sizeof(T));
227 vector_bytes bytes(size, 0, levels_.get_allocator());
228 uint8_t* ptr = bytes.data() + header_size_bytes;
229 const uint8_t* end_ptr = ptr + size;
231 ptr += copy_to_mem(preamble_ints, ptr);
232 const uint8_t ser_ver = SERIAL_VERSION;
233 ptr += copy_to_mem(ser_ver, ptr);
234 const uint8_t family = FAMILY_ID;
235 ptr += copy_to_mem(family, ptr);
238 const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
239 ptr += copy_to_mem(flags_byte, ptr);
240 ptr += copy_to_mem(k_, ptr);
241 ptr +=
sizeof(uint16_t);
242 ptr += copy_to_mem(dim_, ptr);
247 ptr += copy_to_mem(num_retained_, ptr);
248 ptr += copy_to_mem(n_, ptr);
251 size_t pt_size =
sizeof(T) * dim_;
252 for (
const Level& lvl : levels_) {
253 ptr += copy_to_mem(
static_cast<uint32_t
>(lvl.size()), ptr);
254 for (
const Vector& pt : lvl) {
255 ptr += copy_to_mem(pt.data(), ptr, pt_size);
260 throw std::runtime_error(
"Actual output size does not equal expected output size");
265 template<
typename T,
typename K,
typename A>
267 const auto preamble_ints = read<uint8_t>(is);
268 const auto serial_version = read<uint8_t>(is);
269 const auto family_id = read<uint8_t>(is);
270 const auto flags_byte = read<uint8_t>(is);
271 const auto k = read<uint16_t>(is);
273 const auto dim = read<uint32_t>(is);
276 check_serial_version(serial_version);
277 check_family_id(family_id);
278 check_header_validity(preamble_ints, flags_byte, serial_version);
280 if (!is.good())
throw std::runtime_error(
"error reading from std::istream");
281 const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
286 const auto num_retained = read<uint32_t>(is);
287 const auto n = read<uint64_t>(is);
290 size_t pt_size =
sizeof(T) * dim;
291 Levels levels(allocator);
292 int64_t num_to_read = num_retained;
293 while (num_to_read > 0) {
294 const auto level_size = read<uint32_t>(is);
295 Level lvl(allocator);
296 lvl.reserve(level_size);
297 for (uint32_t i = 0; i < level_size; ++i) {
298 Vector pt(dim, 0, allocator);
299 read(is, pt.data(), pt_size);
302 levels.push_back(lvl);
303 num_to_read -= lvl.size();
306 if (num_to_read != 0)
307 throw std::runtime_error(
"Error deserializing sketch: Incorrect number of items read");
308 if (!is.good())
throw std::runtime_error(
"error reading from std::istream");
310 return density_sketch(k, dim, num_retained, n, std::move(levels), kernel);
313 template<
typename T,
typename K,
typename A>
315 ensure_minimum_memory(size, PREAMBLE_INTS_SHORT *
sizeof(uint32_t));
316 const char* ptr =
static_cast<const char*
>(bytes);
317 const char* end_ptr =
static_cast<const char*
>(bytes) + size;
318 uint8_t preamble_ints;
319 ptr += copy_from_mem(ptr, preamble_ints);
320 uint8_t serial_version;
321 ptr += copy_from_mem(ptr, serial_version);
323 ptr += copy_from_mem(ptr, family_id);
325 ptr += copy_from_mem(ptr, flags_byte);
327 ptr += copy_from_mem(ptr, k);
329 ptr += copy_from_mem(ptr, unused);
331 ptr += copy_from_mem(ptr, dim);
334 check_serial_version(serial_version);
335 check_family_id(family_id);
336 check_header_validity(preamble_ints, flags_byte, serial_version);
338 const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
340 return density_sketch(k, dim, kernel, allocator);
343 ensure_minimum_memory(size, PREAMBLE_INTS_LONG *
sizeof(uint32_t));
344 uint32_t num_retained;
345 ptr += copy_from_mem(ptr, num_retained);
347 ptr += copy_from_mem(ptr, n);
352 size_t pt_size =
sizeof(T) * dim;
353 ensure_minimum_memory(end_ptr - ptr, num_retained * pt_size);
356 Levels levels(allocator);
357 int64_t num_to_read = num_retained;
358 while (num_to_read > 0) {
360 ptr += copy_from_mem(ptr, level_size);
361 ensure_minimum_memory(end_ptr - ptr, level_size * pt_size);
362 Level lvl(allocator);
363 lvl.reserve(level_size);
364 for (uint32_t i = 0; i < level_size; ++i) {
365 Vector pt(dim, 0, allocator);
366 ptr += copy_from_mem(ptr, pt.data(), pt_size);
369 levels.push_back(lvl);
370 num_to_read -= lvl.size();
373 if (num_to_read != 0)
374 throw std::runtime_error(
"Error deserializing sketch: Incorrect number of items read");
375 if (ptr > end_ptr)
throw std::runtime_error(
"Error deserializing sketch: Read beyond provided memory");
377 return density_sketch(k, dim, num_retained, n, std::move(levels), kernel);
380 template<
typename T,
typename K,
typename A>
381 void density_sketch<T, K, A>::check_k(uint16_t k) {
383 throw std::invalid_argument(
"k must be > 1. Found: " + std::to_string(k));
386 template<
typename T,
typename K,
typename A>
387 void density_sketch<T, K, A>::check_serial_version(uint8_t serial_version) {
388 if (serial_version == SERIAL_VERSION)
391 throw std::invalid_argument(
"Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
394 template<
typename T,
typename K,
typename A>
395 void density_sketch<T, K, A>::check_family_id(uint8_t family_id) {
396 if (family_id == FAMILY_ID)
399 throw std::invalid_argument(
"Possible corruption. Family id does not indicate density sketch: " + std::to_string(family_id));
402 template<
typename T,
typename K,
typename A>
403 void density_sketch<T, K, A>::check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version) {
404 const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
406 if ((empty && preamble_ints == PREAMBLE_INTS_SHORT)
407 || (!empty && preamble_ints == PREAMBLE_INTS_LONG))
410 std::ostringstream os;
411 os <<
"Possible sketch corruption. Inconsistent state: "
412 <<
"preamble_ints = " << preamble_ints
413 <<
", empty = " << (empty ?
"true" :
"false")
414 <<
", serialization_version = " << serial_version;
415 throw std::invalid_argument(os.str());
419 template<
typename T,
typename K,
typename A>
423 std::ostringstream os;
424 os <<
"### Density sketch summary:" << std::endl;
425 os <<
" K : " << k_ << std::endl;
426 os <<
" Dim : " << dim_ << std::endl;
427 os <<
" Empty : " << (is_empty() ?
"true" :
"false") << std::endl;
428 os <<
" N : " << n_ << std::endl;
429 os <<
" Retained items : " << num_retained_ << std::endl;
430 os <<
" Estimation mode: " << (is_estimation_mode() ?
"true" :
"false") << std::endl;
431 os <<
" Levels : " << levels_.size() << std::endl;
432 os <<
"### End sketch summary" << std::endl;
435 os <<
"### Density sketch levels:" << std::endl;
436 os <<
" height: size" << std::endl;
437 for (
unsigned height = 0; height < levels_.size(); ++height) {
438 os <<
" " << height <<
": "
439 << levels_[height].size() << std::endl;
441 os <<
"### End sketch levels" << std::endl;
445 os <<
"### Density sketch data:" << std::endl;
446 for (
unsigned height = 0; height < levels_.size(); ++height) {
447 os <<
" level " << height <<
": " << std::endl;
448 for (
const auto& point: levels_[height]) {
451 for (
auto value: point) {
459 os <<
"]" << std::endl;
462 os <<
"### End sketch data" << std::endl;
464 return string<A>(os.str().c_str(), levels_.get_allocator());
467 template<
typename T,
typename K,
typename A>
469 return const_iterator(levels_.begin(), levels_.end());
472 template<
typename T,
typename K,
typename A>
474 return const_iterator(levels_.end(), levels_.end());
479 template<
typename T,
typename K,
typename A>
487 while (levels_it_ != levels_end_) {
488 level_it_ = levels_it_->begin();
489 if (level_it_ != levels_it_->end())
break;
495 template<
typename T,
typename K,
typename A>
496 auto density_sketch<T, K, A>::const_iterator::operator++() -> const_iterator& {
498 if (level_it_ == levels_it_->end()) {
502 while (levels_it_ != levels_end_) {
503 level_it_ = levels_it_->begin();
504 if (level_it_ != levels_it_->end())
break;
512 template<
typename T,
typename K,
typename A>
513 auto density_sketch<T, K, A>::const_iterator::operator++(
int) -> const_iterator& {
514 const_iterator tmp(*
this);
519 template<
typename T,
typename K,
typename A>
520 bool density_sketch<T, K, A>::const_iterator::operator==(
const const_iterator& other)
const {
521 if (levels_it_ != other.levels_it_)
return false;
522 if (levels_it_ == levels_end_)
return true;
523 return level_it_ == other.level_it_;
526 template<
typename T,
typename K,
typename A>
527 bool density_sketch<T, K, A>::const_iterator::operator!=(
const const_iterator& other)
const {
528 return !operator==(other);
531 template<
typename T,
typename K,
typename A>
532 auto density_sketch<T, K, A>::const_iterator::operator*() const -> const value_type {
533 return value_type(*level_it_, 1ULL << height_);
536 template<
typename T,
typename K,
typename A>
537 auto density_sketch<T, K, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
Density sketch.
Definition: density_sketch.hpp:57
density_sketch(uint16_t k, uint32_t dim, const Kernel &kernel=Kernel(), const Allocator &allocator=Allocator())
Constructor.
Allocator get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: density_sketch_impl.hpp:127
static density_sketch deserialize(std::istream &is, const K &kernel=K(), const A &allocator=A())
This method deserializes a sketch from a given stream.
DataSketches namespace.
Definition: binomial_bounds.hpp:38