datasketches-cpp/5.0.2/ebpps__sketch__impl_8hpp_source.html

 /*

  * Licensed to the Apache Software Foundation (ASF) under one

  * or more contributor license agreements.  See the NOTICE file

  * distributed with this work for additional information

  * regarding copyright ownership.  The ASF licenses this file

  * to you under the Apache License, Version 2.0 (the

  * "License"); you may not use this file except in compliance

  * with the License.  You may obtain a copy of the License at

  *

  *   http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing,

  * software distributed under the License is distributed on an

  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

  * KIND, either express or implied.  See the License for the

  * specific language governing permissions and limitations

  * under the License.

  */


 #ifndef _EBPPS_SKETCH_IMPL_HPP_

 #define _EBPPS_SKETCH_IMPL_HPP_


 #include <memory>

 #include <sstream>

 #include <cmath>

 #include <random>

 #include <algorithm>

 #include <stdexcept>

 #include <utility>


 #include "ebpps_sketch.hpp"


 namespace datasketches {


 // Builds an empty sketch with maximum sample size k.
 // k is passed through check_k() on its way into sample_, so a value of 0 or
 // one above MAX_K makes construction throw std::invalid_argument.
 // Initial state: no items seen, zero cumulative and maximum weight, rho of 1.
 template<typename T, typename A>
 ebpps_sketch<T, A>::ebpps_sketch(uint32_t k, const A& allocator):
   allocator_(allocator), k_(k), n_(0),
   cumulative_wt_(0.0), wt_max_(0.0), rho_(1.0),
   sample_(check_k(k), allocator)
 {
 }


 // State-restoring constructor: every field is taken verbatim from the
 // arguments (the deserialize() overloads in this file use it).
 //   k             -- maximum sample size
 //   n             -- number of items seen
 //   cumulative_wt -- total weight seen
 //   wt_max        -- largest single item weight seen
 //   rho           -- current rho value
 //   sample        -- sample state; its contents are moved into the sketch
 //   allocator     -- allocator instance stored by the sketch
 // No validation is performed here.
 template<typename T, typename A>
 ebpps_sketch<T,A>::ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt,
                                 double wt_max, double rho,
                                 ebpps_sample<T,A>&& sample, const A& allocator) :
   allocator_(allocator),
   k_(k),
   n_(n),
   cumulative_wt_(cumulative_wt),
   wt_max_(wt_max),
   rho_(rho),
   // A named rvalue reference is an lvalue: without std::move the sample
   // would be copied even though the caller handed over ownership.
   sample_(std::move(sample))
   {}


 // Accessor for k_, the configured maximum sample size.
 template<typename T, typename A>
 uint32_t ebpps_sketch<T, A>::get_k() const {
   return this->k_;
 }


 // Accessor for n_, the count of items accepted by update() or merge().
 template<typename T, typename A>
 uint64_t ebpps_sketch<T, A>::get_n() const {
   return this->n_;
 }


 // Forwards to the internal sample and returns its C value.
 template<typename T, typename A>
 double ebpps_sketch<T, A>::get_c() const {
   return this->sample_.get_c();
 }


 // Accessor for cumulative_wt_, the running total of accepted item weights.
 template<typename T, typename A>
 double ebpps_sketch<T, A>::get_cumulative_weight() const {
   return this->cumulative_wt_;
 }


 // A sketch counts as empty exactly when no item has been counted (n_ is zero).
 template<typename T, typename A>
 bool ebpps_sketch<T, A>::is_empty() const {
   return 0 == n_;
 }


 // Puts the sketch back into the state produced by the (k, allocator)
 // constructor: counters and weights zeroed, rho at 1, sample cleared.
 // k_ and allocator_ are left untouched.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::reset() {
   rho_ = 1.0;
   wt_max_ = 0.0;
   cumulative_wt_ = 0.0;
   n_ = 0;
   sample_.reset();
 }


 // Produces a multi-line, human-readable summary of the sketch state:
 // k, n, cumulative weight, maximum item weight, rho and C.
 // Returns a string<A> built with this sketch's allocator.
 template<typename T, typename A>
 string<A> ebpps_sketch<T, A>::to_string() const {
   // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
   // The stream does not support passing an allocator instance, and alternatives are complicated.
   std::ostringstream os;
   // '\n' instead of std::endl: the text is identical and there is nothing
   // useful to flush on an in-memory stream.
   os << "### EBPPS Sketch SUMMARY:" << '\n';
   os << "   k            : " << k_ << '\n';
   os << "   n            : " << n_ << '\n';
   os << "   cum. weight  : " << cumulative_wt_ << '\n';
   // label previously read "wt_mac" (typo)
   os << "   wt_max       : " << wt_max_ << '\n';
   os << "   rho          : " << rho_ << '\n';
   os << "   C            : " << sample_.get_c() << '\n';
   os << "### END SKETCH SUMMARY" << '\n';
   return string<A>(os.str().c_str(), allocator_);
 }


 // Returns a header line followed by whatever the internal sample's
 // to_string() produces, as a string<A> built with this sketch's allocator.
 template<typename T, typename A>
 string<A> ebpps_sketch<T, A>::items_to_string() const {
   // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
   // The stream does not support passing an allocator instance, and alternatives are complicated.
   std::ostringstream buffer;
   buffer << "### Sketch Items" << std::endl
          << sample_.to_string(); // assumes std::endl at end
   const std::string text = buffer.str();
   return string<A>(text.c_str(), allocator_);
 }


 // Returns a copy of the allocator instance stored in the sketch.
 template<typename T, typename A>
 A ebpps_sketch<T, A>::get_allocator() const {
   return this->allocator_;
 }


 // Copying overload: offers item with the given weight to the sketch.
 // Validation and the actual work happen in internal_update(), which throws
 // std::invalid_argument for a negative, NaN or infinite weight.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::update(const T& item, double weight) {
   internal_update(item, weight);
 }


 // Moving overload: offers item with the given weight to the sketch, passing
 // it on as an rvalue. Validation and the actual work happen in
 // internal_update(), which throws std::invalid_argument for a negative, NaN
 // or infinite weight.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::update(T&& item, double weight) {
   internal_update(std::move(item), weight);
 }


 // Shared implementation behind both update() overloads.
 //   item   -- the item to offer; passed on through conditional_forward
 //   weight -- must be finite and >= 0; a weight of exactly 0 is ignored
 // Throws std::invalid_argument when weight is negative, NaN or infinite.
 // Sketch fields are only written after the sample has accepted the item.
 template<typename T, typename A>
 template<typename FwdItem>
 void ebpps_sketch<T, A>::internal_update(FwdItem&& item, double weight) {
   const bool bad_weight = std::isnan(weight) || std::isinf(weight) || weight < 0.0;
   if (bad_weight) {
     throw std::invalid_argument("Item weights must be nonnegative and finite. Found: "
                                 + std::to_string(weight));
   }
   if (weight == 0.0) {
     return; // nothing to add; n_ is intentionally left unchanged
   }

   // candidate state after accepting this item
   const double total_wt = cumulative_wt_ + weight;
   const double heaviest = std::max(wt_max_, weight);
   const double next_rho = std::min(1.0 / heaviest, k_ / total_wt);

   // only an already-populated sample needs rescaling to the new rho
   const bool has_prior_weight = cumulative_wt_ > 0.0;
   if (has_prior_weight) {
     sample_.downsample(next_rho / rho_);
   }

   // wrap the incoming item in a one-item sample and merge it in
   ebpps_sample<T,A> single(conditional_forward<FwdItem>(item), next_rho * weight, allocator_);
   sample_.merge(single);

   // commit the new state
   ++n_;
   rho_ = next_rho;
   wt_max_ = heaviest;
   cumulative_wt_ = total_wt;
 }


 template<typename T, typename A>

 auto ebpps_sketch<T,A>::get_result() const -> result_type {

   return sample_.get_sample();

 }


 /* Merging

  * There is a trivial merge algorithm that involves downsampling each sketch A and B

  * as A.cum_wt / (A.cum_wt + B.cum_wt) and B.cum_wt / (A.cum_wt + B.cum_wt),

  * respectively. That merge does preserve first-order probabilities, specifically

  * the probability proportional to size property, and like all other known merge

  * algorithms distorts second-order probabilities (co-occurrences). There are

  * pathological cases, most obvious with k=2 and A.cum_wt == B.cum_wt where that

  * approach will always take exactly 1 item from A and 1 from B, meaning the

  * co-occurrence rate for two items from either sketch is guaranteed to be 0.0.

  *

  * With EBPPS, once an item is accepted into the sketch we no longer need to

  * track the item's weight: All accepted items are treated equally. As a result, we

  * can take inspiration from the reservoir sampling merge in the datasketches-java

  * library. We need to merge the smaller sketch into the larger one, swapping as

  * needed to ensure that, at which point we simply call update() with the items

  * in the smaller sketch as long as we adjust the weight appropriately.

  * Merging smaller into larger is essential to ensure that no item has a

  * contribution to C > 1.0.

  */


 // Merges sk into this sketch (rvalue overload).
 // A sketch with zero cumulative weight contributes nothing. If sk carries
 // more weight than this sketch, the two are swapped first so that the
 // lighter sketch is always the one folded into the heavier one; sk is then
 // left holding this sketch's former contents.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::merge(ebpps_sketch<T, A>&& sk) {
   const double incoming_wt = sk.get_cumulative_weight();
   if (incoming_wt == 0.0) {
     return;
   }

   if (incoming_wt > get_cumulative_weight()) {
     // need to swap this with sk to merge smaller into larger
     std::swap(*this, sk);
   }

   internal_merge(sk);
 }


 // Merges sk into this sketch (const-reference overload); sk is not modified.
 // A sketch with zero cumulative weight contributes nothing. If sk carries
 // more weight than this sketch, a copy of sk is swapped with *this so that
 // the lighter sketch is always the one folded into the heavier one.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::merge(const ebpps_sketch<T, A>& sk) {
   const double incoming_wt = sk.get_cumulative_weight();
   if (incoming_wt == 0.0) return;

   if (incoming_wt > get_cumulative_weight()) {
     // need to swap this with sk to merge, so make a copy, swap,
     // and use that to merge
     ebpps_sketch sk_copy(sk);
     // Bring std::swap into scope explicitly: a bare swap() call only finds
     // it through ADL when T or A happens to live in namespace std.
     using std::swap;
     swap(*this, sk_copy);
     internal_merge(sk_copy);
   } else {
     internal_merge(sk);
   }
 }


 // Folds the contents of sk into this sketch.
 // Precondition (not checked here): sk.cumulative_wt_ <= cumulative_wt_;
 // both merge() overloads arrange this before calling.
 // O is the deduced argument type and is passed to conditional_forward for
 // every item taken from sk's sample.
 // After the call k_ is the smaller of the two k values, n_ and
 // cumulative_wt_ are the sums, and wt_max_ is the larger of the two maxima.
 template<typename T, typename A>
 template<typename O>
 void ebpps_sketch<T, A>::internal_merge(O&& sk) {
   // assumes that sk.cumulative_wt_ <= cumulative_wt_,
   // which must be checked before calling this

   const ebpps_sample<T,A>& other_sample = sk.sample_;

   const double final_cum_wt = cumulative_wt_ + sk.cumulative_wt_;
   const double new_wt_max = std::max(wt_max_, sk.wt_max_);
   k_ = std::min(k_, sk.k_);
   const uint64_t new_n = n_ + sk.n_;

   // Insert sk's items with the cumulative weight
   // split between the input items. We repeat the same process
   // for full items and the partial item, scaling the input
   // weight appropriately.
   // We handle all C input items, meaning we always process
   // the partial item using a scaled down weight.
   // Handling the partial item by probabilistically including
   // it as a full item would be correct on average but would
   // introduce bias for any specific merge operation.
   const double avg_wt = sk.get_cumulative_weight() / sk.get_c();
   auto items = other_sample.get_full_items();
   for (size_t i = 0; i < items.size(); ++i) {
     // new_wt_max is pre-computed
     const double new_cum_wt = cumulative_wt_ + avg_wt;
     const double new_rho = std::min(1.0 / new_wt_max, k_ / new_cum_wt);

     if (cumulative_wt_ > 0.0)
       sample_.downsample(new_rho / rho_);

     ebpps_sample<T,A> tmp(conditional_forward<O>(items[i]), new_rho * avg_wt, allocator_);

     sample_.merge(tmp);

     cumulative_wt_ = new_cum_wt;
     rho_ = new_rho;
   }

   // insert partial item with weight scaled by the fractional part of C
   if (other_sample.has_partial_item()) {
     double unused;
     const double other_c_frac = std::modf(other_sample.get_c(), &unused);

     const double new_cum_wt = cumulative_wt_ + (other_c_frac * avg_wt);
     const double new_rho = std::min(1.0 / new_wt_max, k_ / new_cum_wt);

     if (cumulative_wt_ > 0.0)
       sample_.downsample(new_rho / rho_);

     ebpps_sample<T,A> tmp(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt, allocator_);

     sample_.merge(tmp);

     cumulative_wt_ = new_cum_wt;
     rho_ = new_rho;
   }

   // avoid numeric issues by setting cumulative weight to the
   // pre-computed value
   cumulative_wt_ = final_cum_wt;
   // Persist the merged maximum weight. rho_ above was derived from
   // new_wt_max; leaving wt_max_ at its old (possibly smaller) value would let
   // a later update compute a rho larger than the current one and would
   // serialize/print a stale maximum.
   wt_max_ = new_wt_max;
   n_ = new_n;
 }


 /*

  * An empty sketch requires 8 bytes.

  *

  * <pre>

  * Long || Start Byte Adr:

  * Adr:

  *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |

  *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|

  * </pre>

  *

  * A non-empty sketch requires 40 bytes of preamble. C looks like part of

  * the preamble but is serialized as part of the internal sample state.

  *

  * The count of items seen is not used but preserved as the value seems like a useful

  * count to track.

  *

  * <pre>

  * Long || Start Byte Adr:

  * Adr:

  *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |

  *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|

  *

  *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |

  *  1   ||---------------------------Items Seen Count (N)--------------------------------|

  *

  *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |

  *  2   ||----------------------------Cumulative Weight----------------------------------|

  *

  *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |

  *  3   ||-----------------------------Max Item Weight-----------------------------------|

  *

  *      ||      32        |   33   |   34   |   35   |   36   |   37   |   38   |   39   |

  *  4   ||----------------------------------Rho------------------------------------------|

  *

  *      ||      40        |   41   |   42   |   43   |   44   |   45   |   46   |   47   |

  *  5   ||-----------------------------------C-------------------------------------------|

  *

  *      ||      48+                      |

  *  6+  ||  {Items Array}                |

  *      ||  {Optional Item (if needed)}  |

  * </pre>

  */


 // Number of bytes serialize() needs for the current state, excluding any
 // caller-requested header: the empty preamble alone for an empty sketch,
 // otherwise the full preamble plus the size reported by the sample.
 // Preamble lengths are counted in 8-byte longs, hence the shift by 3.
 template<typename T, typename A>
 template<typename SerDe>
 size_t ebpps_sketch<T, A>::get_serialized_size_bytes(const SerDe& sd) const {
   if (!is_empty()) {
     return (PREAMBLE_LONGS_FULL << 3) + sample_.get_serialized_size_bytes(sd);
   }
   return PREAMBLE_LONGS_EMPTY << 3;
 }


 // Serializes the sketch into a freshly allocated byte vector.
 //   header_size_bytes -- number of zero-filled bytes reserved in front of
 //                        the sketch image
 //   sd                -- SerDe passed through to the sample for the items
 // Layout after the header: preamble longs, serial version, family id,
 // flags and k; for a non-empty sketch these are followed by n, cumulative
 // weight, max weight, rho and the sample's own serialized form.
 template<typename T, typename A>
 template<typename SerDe>
 auto ebpps_sketch<T,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
   const uint8_t prelongs = (is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_FULL);

   const size_t size = header_size_bytes + (prelongs << 3) + sample_.get_serialized_size_bytes(sd);
   vector_bytes bytes(size, 0, allocator_);
   uint8_t* ptr = bytes.data() + header_size_bytes;
   // size already includes the header, so the end of the buffer is measured
   // from its start. Measuring from ptr (which has skipped the header) would
   // place the limit header_size_bytes past the real end.
   const uint8_t* end_ptr = bytes.data() + size;

   uint8_t flags = 0;
   if (is_empty()) {
     flags |= EMPTY_FLAG_MASK;
   } else {
     flags |= sample_.has_partial_item() ? HAS_PARTIAL_ITEM_MASK : 0;
   }

   // first prelong
   // (each copy_to_mem result is used to advance the write position)
   const uint8_t ser_ver = SER_VER;
   const uint8_t family = FAMILY_ID;
   ptr += copy_to_mem(prelongs, ptr);
   ptr += copy_to_mem(ser_ver, ptr);
   ptr += copy_to_mem(family, ptr);
   ptr += copy_to_mem(flags, ptr);
   ptr += copy_to_mem(k_, ptr);

   if (!is_empty()) {
     // remaining preamble
     ptr += copy_to_mem(n_, ptr);
     ptr += copy_to_mem(cumulative_wt_, ptr);
     ptr += copy_to_mem(wt_max_, ptr);
     ptr += copy_to_mem(rho_, ptr);
     ptr += sample_.serialize(ptr, end_ptr, sd);
   }

   return bytes;
 }


 // Writes the sketch to os in the same field order as the byte-vector
 // overload: preamble longs, serial version, family id, flags, k, and for a
 // non-empty sketch n, cumulative weight, max weight, rho and the sample.
 // Throws std::runtime_error if the stream is not in a good state afterwards.
 template<typename T, typename A>
 template<typename SerDe>
 void ebpps_sketch<T,A>::serialize(std::ostream& os, const SerDe& sd) const {
   const bool empty = is_empty();
   const uint8_t prelongs = (empty ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_FULL);

   // an empty sketch sets only the empty flag; otherwise record whether the
   // sample carries a partial item
   uint8_t flags = 0;
   if (empty) {
     flags |= EMPTY_FLAG_MASK;
   } else if (sample_.has_partial_item()) {
     flags |= HAS_PARTIAL_ITEM_MASK;
   }

   // first prelong
   const uint8_t ser_ver = SER_VER;
   const uint8_t family = FAMILY_ID;
   write(os, prelongs);
   write(os, ser_ver);
   write(os, family);
   write(os, flags);
   write(os, k_);

   // remaining preamble and sample, present only when there is data
   if (!empty) {
     write(os, n_);
     write(os, cumulative_wt_);
     write(os, wt_max_);
     write(os, rho_);
     sample_.serialize(os, sd);
   }

   if (!os.good()) {
     throw std::runtime_error("error writing to std::ostream");
   }
 }


 // Rebuilds a sketch from a serialized image in memory.
 //   bytes     -- start of the image
 //   size      -- number of readable bytes at 'bytes'
 //   sd        -- SerDe passed to the sample for item decoding
 //   allocator -- allocator for the new sketch
 // Checks performed here: at least 8 bytes available, k in range, preamble
 // length consistent with the empty flag, family id and serial version as
 // expected, enough bytes for the declared preamble, and agreement between
 // the partial-item flag and the deserialized sample.
 // Throws std::invalid_argument from the check_* helpers and
 // std::runtime_error when the partial-item flag and sample disagree.
 template<typename T, typename A>
 template<typename SerDe>
 ebpps_sketch<T,A> ebpps_sketch<T,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
   ensure_minimum_memory(size, 8);
   const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
   const uint8_t* end_ptr = ptr + size;
   uint8_t prelongs;
   ptr += copy_from_mem(ptr, prelongs);
   uint8_t serial_version;
   ptr += copy_from_mem(ptr, serial_version);
   uint8_t family_id;
   ptr += copy_from_mem(ptr, family_id);
   uint8_t flags;
   ptr += copy_from_mem(ptr, flags);
   uint32_t k;
   ptr += copy_from_mem(ptr, k);

   check_k(k);
   check_preamble_longs(prelongs, flags);
   check_family_and_serialization_version(family_id, serial_version);
   ensure_minimum_memory(size, prelongs << 3);

   const bool empty = flags & EMPTY_FLAG_MASK;
   if (empty)
     return ebpps_sketch(k, allocator);

   uint64_t n;
   ptr += copy_from_mem(ptr, n);
   double cumulative_wt;
   ptr += copy_from_mem(ptr, cumulative_wt);
   double wt_max;
   ptr += copy_from_mem(ptr, wt_max);
   double rho;
   ptr += copy_from_mem(ptr, rho);

   auto pair = ebpps_sample<T, A>::deserialize(ptr, end_ptr - ptr, sd, allocator);
   // Move the sample out of the pair rather than deep-copying it.
   // pair.second is not needed: nothing is read after the sample.
   ebpps_sample<T, A> sample = std::move(pair.first);

   if (sample.has_partial_item() != bool(flags & HAS_PARTIAL_ITEM_MASK))
     throw std::runtime_error("sketch fails internal consistency check");

   return ebpps_sketch(k, n, cumulative_wt, wt_max, rho, std::move(sample), allocator);
 }


 // Rebuilds a sketch from a stream written by serialize(std::ostream&, ...).
 //   is        -- input stream positioned at the start of the image
 //   sd        -- SerDe passed to the sample for item decoding
 //   allocator -- allocator for the new sketch
 // Throws std::runtime_error if the stream is not in a good state after
 // reading the fixed-size fields or if the partial-item flag disagrees with
 // the deserialized sample, and std::invalid_argument from the check_*
 // helpers.
 template<typename T, typename A>
 template<typename SerDe>
 ebpps_sketch<T,A> ebpps_sketch<T,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
   const uint8_t prelongs = read<uint8_t>(is);
   const uint8_t ser_ver = read<uint8_t>(is);
   const uint8_t family = read<uint8_t>(is);
   const uint8_t flags = read<uint8_t>(is);
   const uint32_t k = read<uint32_t>(is);

   // Do not interpret header fields from a short or failed read.
   // NOTE(review): assumes read<T>() does not report stream errors itself -- confirm.
   if (!is.good()) throw std::runtime_error("error reading from std::istream");

   check_k(k);
   check_family_and_serialization_version(family, ser_ver);
   check_preamble_longs(prelongs, flags);

   const bool empty = (flags & EMPTY_FLAG_MASK);

   if (empty)
     return ebpps_sketch(k, allocator);

   const uint64_t n = read<uint64_t>(is);
   const double cumulative_wt = read<double>(is);
   const double wt_max = read<double>(is);
   const double rho = read<double>(is);

   // Same guard for the remaining preamble before the sample is decoded.
   if (!is.good()) throw std::runtime_error("error reading from std::istream");

   auto sample = ebpps_sample<T,A>::deserialize(is, sd, allocator);

   if (sample.has_partial_item() != bool(flags & HAS_PARTIAL_ITEM_MASK))
     throw std::runtime_error("sketch fails internal consistency check");

   return ebpps_sketch(k, n, cumulative_wt, wt_max, rho, std::move(sample), allocator);
 }


 // Validates a requested sample size and returns it unchanged so the call
 // can be used inside an initializer list.
 // Accepts 1 <= k <= MAX_K; throws std::invalid_argument otherwise.
 template <typename T, typename A>
 inline uint32_t ebpps_sketch<T, A>::check_k(uint32_t k)
 {
   // k == MAX_K is accepted, so the message says "no greater than" rather
   // than "less than".
   if (k == 0 || k > MAX_K)
     throw std::invalid_argument("k must be strictly positive and no greater than " + std::to_string(MAX_K));
   return k;
 }


 // Verifies the family id and serial version read from a serialized image.
 // The family id is examined first; the version is only looked at when the
 // family matches. Throws std::invalid_argument on either mismatch.
 template<typename T, typename A>
 void ebpps_sketch<T, A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
   if (family_id != FAMILY_ID) {
     throw std::invalid_argument("Possible corruption: EBPPS Sketch family id must be "
       + std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
   }

   if (ser_ver != SER_VER) {
     throw std::invalid_argument("Possible corruption: EBPPS serialization version must be "
       + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
   }
 }


 // Verifies that the preamble length matches the empty flag, and that an
 // image flagged as empty does not also carry the partial-item flag.
 // Throws std::invalid_argument on any inconsistency.
 template <typename T, typename A>
 void ebpps_sketch<T, A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags)
 {
   const bool flagged_empty = (flags & EMPTY_FLAG_MASK) != 0;

   // non-empty image: only the preamble length needs checking
   if (!flagged_empty) {
     if (preamble_longs != PREAMBLE_LONGS_FULL) {
       throw std::invalid_argument("Possible corruption: Preamble longs must be "
         + std::to_string(PREAMBLE_LONGS_FULL)
         + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
     }
     return;
   }

   // empty image: short preamble and no item indicators
   if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
     throw std::invalid_argument("Possible corruption: Preamble longs must be "
       + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
       + std::to_string(preamble_longs));
   }
   if (flags & HAS_PARTIAL_ITEM_MASK) {
     throw std::invalid_argument("Possible corruption: Empty sketch must not "
       "contain indications of the presence of any item");
   }
 }


 // Forwards to the internal sample's begin() and returns its const_iterator.
 template<typename T, typename A>
 typename ebpps_sample<T, A>::const_iterator ebpps_sketch<T, A>::begin() const {
   return this->sample_.begin();
 }


 // Forwards to the internal sample's end() and returns its const_iterator.
 template<typename T, typename A>
 typename ebpps_sample<T, A>::const_iterator ebpps_sketch<T, A>::end() const {
   return this->sample_.end();
 }


 } // namespace datasketches


 #endif // _EBPPS_SKETCH_IMPL_HPP_

datasketches::ebpps_sketch
An implementation of an Exact and Bounded Sampling Proportional to Size sketch.
Definition: ebpps_sketch.hpp:59

datasketches::ebpps_sketch::items_to_string
string< A > items_to_string() const
Prints the raw sketch items to a string.
Definition: ebpps_sketch_impl.hpp:110

datasketches::ebpps_sketch::update
void update(const T &item, double weight=1.0)
Updates this sketch with the given data item with the given weight.
Definition: ebpps_sketch_impl.hpp:125

datasketches::ebpps_sketch::ebpps_sketch
ebpps_sketch(uint32_t k, const A &allocator=A())
Constructor.
Definition: ebpps_sketch_impl.hpp:36

datasketches::ebpps_sketch::serialize
vector_bytes serialize(unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const
This method serializes the sketch as a vector of bytes.

datasketches::ebpps_sketch::end
ebpps_sample< T, A >::const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: ebpps_sketch_impl.hpp:525

datasketches::ebpps_sketch::is_empty
bool is_empty() const
Returns true if the sketch is empty.
Definition: ebpps_sketch_impl.hpp:80

datasketches::ebpps_sketch::begin
ebpps_sample< T, A >::const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: ebpps_sketch_impl.hpp:520

datasketches::ebpps_sketch::get_serialized_size_bytes
size_t get_serialized_size_bytes(const SerDe &sd=SerDe()) const
Computes size needed to serialize the current state of the sketch.
Definition: ebpps_sketch_impl.hpp:321

datasketches::ebpps_sketch::get_cumulative_weight
double get_cumulative_weight() const
Returns the cumulative weight of items processed by the sketch.
Definition: ebpps_sketch_impl.hpp:75

datasketches::ebpps_sketch::get_allocator
A get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: ebpps_sketch_impl.hpp:120

datasketches::ebpps_sketch::merge
void merge(const ebpps_sketch< T, A > &sketch)
Merges the provided sketch into the current one.
Definition: ebpps_sketch_impl.hpp:198

datasketches::ebpps_sketch::deserialize
static ebpps_sketch deserialize(const void *bytes, size_t size, const SerDe &sd=SerDe(), const A &allocator=A())
This method deserializes a sketch from a given array of bytes.

datasketches::ebpps_sketch::reset
void reset()
Resets the sketch to its default, empty state.
Definition: ebpps_sketch_impl.hpp:85

datasketches::ebpps_sketch::get_c
double get_c() const
Returns the expected number of samples returned upon a call to get_result() or the creation of an iterator.
Definition: ebpps_sketch_impl.hpp:70

datasketches::ebpps_sketch::to_string
string< A > to_string() const
Prints a summary of the sketch.
Definition: ebpps_sketch_impl.hpp:94

datasketches::ebpps_sketch::get_result
result_type get_result() const
Returns a copy of the current sample, as a std::vector.
Definition: ebpps_sketch_impl.hpp:162

datasketches::ebpps_sketch::get_k
uint32_t get_k() const
Returns the configured maximum sample size.
Definition: ebpps_sketch_impl.hpp:60

datasketches::ebpps_sketch::get_n
uint64_t get_n() const
Returns the number of items processed by the sketch, regardless of item weight.
Definition: ebpps_sketch_impl.hpp:65

datasketches
DataSketches namespace.
Definition: binomial_bounds.hpp:38