datasketches-cpp
density_sketch.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef DENSITY_SKETCH_HPP_
21 #define DENSITY_SKETCH_HPP_
22 
23 #include <type_traits>
24 #include <vector>
25 #include <functional>
26 #include <numeric>
27 #include <cmath>
28 
29 #include "common_defs.hpp"
30 
31 namespace datasketches {
32 
33 template<typename T>
34 struct gaussian_kernel {
35  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
36  return exp(-std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}));
37  }
38 };
39 
52 template<
53  typename T,
54  typename Kernel = gaussian_kernel<T>,
55  typename Allocator = std::allocator<T>
56 >
58  static_assert(std::is_floating_point<T>::value, "Floating point type expected");
59 
60 public:
61  using Vector = std::vector<T, Allocator>;
62  using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
63  using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
64 
72  density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator());
73 
78  uint16_t get_k() const;
79 
84  uint32_t get_dim() const;
85 
90  bool is_empty() const;
91 
96  uint64_t get_n() const;
97 
102  uint32_t get_num_retained() const;
103 
108  bool is_estimation_mode() const;
109 
114  template<typename FwdVector>
115  void update(FwdVector&& point);
116 
121  template<typename FwdSketch>
122  void merge(FwdSketch&& other);
123 
128  T get_estimate(const std::vector<T>& point) const;
129 
134  Allocator get_allocator() const;
135 
140  void serialize(std::ostream& os) const;
141 
142  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
143 
151  vector_bytes serialize(unsigned header_size_bytes = 0) const;
152 
160  static density_sketch deserialize(std::istream& is,
161  const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
162 
171  static density_sketch deserialize(const void* bytes, size_t size,
172  const Kernel& kernel=Kernel(), const Allocator& allocator = Allocator());
173 
179  string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
180 
181  class const_iterator;
182 
188  const_iterator begin() const;
189 
196  const_iterator end() const;
197 
198 private:
199  enum flags { RESERVED0, RESERVED1, IS_EMPTY };
200  static const uint8_t PREAMBLE_INTS_SHORT = 3;
201  static const uint8_t PREAMBLE_INTS_LONG = 6;
202  static const uint8_t FAMILY_ID = 19;
203  static const uint8_t SERIAL_VERSION = 1;
204  static const size_t LEVELS_ARRAY_START = 5;
205 
206  Allocator allocator_;
207  Kernel kernel_;
208  uint16_t k_;
209  uint32_t dim_;
210  uint32_t num_retained_;
211  uint64_t n_;
212  Levels levels_;
213 
214  void compact();
215  void compact_level(unsigned height);
216 
217  static void check_k(uint16_t k);
218  static void check_serial_version(uint8_t serial_version);
219  static void check_family_id(uint8_t family_id);
220  static void check_header_validity(uint8_t preamble_ints, uint8_t flags_byte, uint8_t serial_version);
221 
222  density_sketch(uint16_t k, uint32_t dim, uint32_t num_retained, uint64_t n, Levels&& levels,
223  const Kernel& kernel = Kernel());
224 };
225 
226 template<typename T, typename K, typename A>
227 class density_sketch<T, K, A>::const_iterator {
228 public:
229  using Vector = density_sketch<T, K, A>::Vector;
230  using iterator_category = std::input_iterator_tag;
231  using value_type = std::pair<const Vector&, const uint64_t>;
232  using difference_type = void;
233  using pointer = return_value_holder<value_type>;
234  using reference = const value_type;
235  const_iterator& operator++();
236  const_iterator& operator++(int);
237  bool operator==(const const_iterator& other) const;
238  bool operator!=(const const_iterator& other) const;
239  const value_type operator*() const;
240  const return_value_holder<value_type> operator->() const;
241 private:
242  using LevelsIterator = typename density_sketch<T, K, A>::Levels::const_iterator;
243  using LevelIterator = typename density_sketch<T, K, A>::Level::const_iterator;
244  LevelsIterator levels_it_;
245  LevelsIterator levels_end_;
246  LevelIterator level_it_;
247  unsigned height_;
248  friend class density_sketch<T, K, A>;
249  const_iterator(LevelsIterator begin, LevelsIterator end);
250 };
251 
252 } /* namespace datasketches */
253 
254 #include "density_sketch_impl.hpp"
255 
256 #endif
Density sketch.
Definition: density_sketch.hpp:57
density_sketch(uint16_t k, uint32_t dim, const Kernel &kernel=Kernel(), const Allocator &allocator=Allocator())
Constructor.
T get_estimate(const std::vector< T > &point) const
Density estimate at a given point.
Definition: density_sketch_impl.hpp:115
void serialize(std::ostream &os) const
This method serializes the sketch into a given stream in a binary form.
Definition: density_sketch_impl.hpp:184
void merge(FwdSketch &&other)
Merges another sketch into this one.
Definition: density_sketch_impl.hpp:98
static density_sketch deserialize(const void *bytes, size_t size, const Kernel &kernel=Kernel(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given array of bytes.
uint32_t get_num_retained() const
Returns the number of retained points in the sketch.
Definition: density_sketch_impl.hpp:77
uint32_t get_dim() const
Returns configured dimensions.
Definition: density_sketch_impl.hpp:62
string< Allocator > to_string(bool print_levels=false, bool print_items=false) const
Prints a summary of the sketch.
Definition: density_sketch_impl.hpp:420
uint16_t get_k() const
Returns configured parameter K.
Definition: density_sketch_impl.hpp:57
bool is_empty() const
Returns true if this sketch is empty.
Definition: density_sketch_impl.hpp:67
void update(FwdVector &&point)
Updates this sketch with a given point.
Definition: density_sketch_impl.hpp:88
Allocator get_allocator() const
Returns an instance of the allocator for this sketch.
Definition: density_sketch_impl.hpp:127
const_iterator begin() const
Iterator pointing to the first item in the sketch.
Definition: density_sketch_impl.hpp:468
static density_sketch deserialize(std::istream &is, const Kernel &kernel=Kernel(), const Allocator &allocator=Allocator())
This method deserializes a sketch from a given stream.
const_iterator end() const
Iterator pointing to the past-the-end item in the sketch.
Definition: density_sketch_impl.hpp:473
bool is_estimation_mode() const
Returns true if this sketch is in estimation mode.
Definition: density_sketch_impl.hpp:82
uint64_t get_n() const
Returns the length of the input stream (number of points observed by this sketch).
Definition: density_sketch_impl.hpp:72
DataSketches namespace.
Definition: binomial_bounds.hpp:38