datasketches-cpp
kll_helper_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef KLL_HELPER_IMPL_HPP_
21 #define KLL_HELPER_IMPL_HPP_
22 
23 #include <algorithm>
24 #include <stdexcept>
25 
26 #include "common_defs.hpp"
27 
28 namespace datasketches {
29 
30 bool kll_helper::is_even(uint32_t value) {
31  return (value & 1) == 0;
32 }
33 
34 bool kll_helper::is_odd(uint32_t value) {
35  return (value & 1) > 0;
36 }
37 
38 uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) {
39  if (denom > numer) return 0;
40  uint8_t count = 0;
41  while (true) {
42  denom <<= 1;
43  if (denom > numer) return count;
44  count++;
45  }
46 }
47 
48 uint8_t kll_helper::ub_on_num_levels(uint64_t n) {
49  if (n == 0) return 1;
50  return 1 + floor_of_log2_of_fraction(n, 1);
51 }
52 
53 uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels) {
54  uint32_t total = 0;
55  for (uint8_t h = 0; h < num_levels; h++) {
56  total += level_capacity(k, num_levels, h, m);
57  }
58  return total;
59 }
60 
61 uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
62  if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
63  const uint8_t depth = numLevels - height - 1;
64  return std::max<uint16_t>(min_wid, int_cap_aux(k, depth));
65 }
66 
67 uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
68  if (depth > 60) throw std::invalid_argument("depth > 60");
69  if (depth <= 30) return int_cap_aux_aux(k, depth);
70  const uint8_t half = depth / 2;
71  const uint8_t rest = depth - half;
72  const uint16_t tmp = int_cap_aux_aux(k, half);
73  return int_cap_aux_aux(tmp, rest);
74 }
75 
76 uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
77  if (depth > 30) throw std::invalid_argument("depth > 30");
78  const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
79  const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
80  const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
81  if (result > k) throw std::logic_error("result > k");
82  return static_cast<uint16_t>(result);
83 }
84 
85 uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
86  uint64_t total = 0;
87  uint64_t weight = 1;
88  for (uint8_t lvl = 0; lvl < num_levels; lvl++) {
89  total += weight * (levels[lvl + 1] - levels[lvl]);
90  weight *= 2;
91  }
92  return total;
93 }
94 
95 template <typename T>
96 void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
97  if (!is_even(length)) throw std::invalid_argument("length must be even");
98  const uint32_t half_length = length / 2;
99 #ifdef KLL_VALIDATION
100  const uint32_t offset = deterministic_offset();
101 #else
102  const uint32_t offset = random_utils::random_bit();
103 #endif
104  uint32_t j = start + offset;
105  for (uint32_t i = start; i < (start + half_length); i++) {
106  if (i != j) buf[i] = std::move(buf[j]);
107  j += 2;
108  }
109 }
110 
111 template <typename T>
112 void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
113  if (!is_even(length)) throw std::invalid_argument("length must be even");
114  const uint32_t half_length = length / 2;
115 #ifdef KLL_VALIDATION
116  const uint32_t offset = deterministic_offset();
117 #else
118  const uint32_t offset = random_utils::random_bit();
119 #endif
120  uint32_t j = (start + length) - 1 - offset;
121  for (uint32_t i = (start + length) - 1; i >= (start + half_length); i--) {
122  if (i != j) buf[i] = std::move(buf[j]);
123  j -= 2;
124  }
125 }
126 
127 // this version moves objects within the same buffer
128 // assumes that destination has initialized objects
129 // does not destroy the originals after the move
130 template <typename T, typename C>
131 void kll_helper::merge_sorted_arrays(T* buf, uint32_t start_a, uint32_t len_a, uint32_t start_b, uint32_t len_b, uint32_t start_c) {
132  const uint32_t len_c = len_a + len_b;
133  const uint32_t lim_a = start_a + len_a;
134  const uint32_t lim_b = start_b + len_b;
135  const uint32_t lim_c = start_c + len_c;
136 
137  uint32_t a = start_a;
138  uint32_t b = start_b;
139 
140  for (uint32_t c = start_c; c < lim_c; c++) {
141  if (a == lim_a) {
142  if (b != c) buf[c] = std::move(buf[b]);
143  b++;
144  } else if (b == lim_b) {
145  if (a != c) buf[c] = std::move(buf[a]);
146  a++;
147  } else if (C()(buf[a], buf[b])) {
148  if (a != c) buf[c] = std::move(buf[a]);
149  a++;
150  } else {
151  if (b != c) buf[c] = std::move(buf[b]);
152  b++;
153  }
154  }
155  if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
156 }
157 
158 // this version is to merge from two different buffers into a third buffer
159 // initializes objects is the destination buffer
160 // moves objects from buf_a and destroys the originals
161 // copies objects from buf_b
162 template <typename T, typename C>
163 void kll_helper::merge_sorted_arrays(const T* buf_a, uint32_t start_a, uint32_t len_a, const T* buf_b, uint32_t start_b, uint32_t len_b, T* buf_c, uint32_t start_c) {
164  const uint32_t len_c = len_a + len_b;
165  const uint32_t lim_a = start_a + len_a;
166  const uint32_t lim_b = start_b + len_b;
167  const uint32_t lim_c = start_c + len_c;
168 
169  uint32_t a = start_a;
170  uint32_t b = start_b;
171 
172  for (uint32_t c = start_c; c < lim_c; c++) {
173  if (a == lim_a) {
174  new (&buf_c[c]) T(buf_b[b++]);
175  } else if (b == lim_b) {
176  new (&buf_c[c]) T(std::move(buf_a[a]));
177  buf_a[a++].~T();
178  } else if (C()(buf_a[a], buf_b[b])) {
179  new (&buf_c[c]) T(std::move(buf_a[a]));
180  buf_a[a++].~T();
181  } else {
182  new (&buf_c[c]) T(buf_b[b++]);
183  }
184  }
185  if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
186 }
187 
188 /*
189  * Here is what we do for each level:
190  * If it does not need to be compacted, then simply copy it over.
191  *
192  * Otherwise, it does need to be compacted, so...
193  * Copy zero or one guy over.
194  * If the level above is empty, halve up.
195  * Else the level above is nonempty, so...
196  * halve down, then merge up.
197  * Adjust the boundaries of the level above.
198  *
199  * It can be proved that general_compress returns a sketch that satisfies the space constraints
200  * no matter how much data is passed in.
201  * All levels except for level zero must be sorted before calling this, and will still be
202  * sorted afterwards.
203  * Level zero is not required to be sorted before, and may not be sorted afterwards.
204  */
205 template <typename T, typename C>
206 kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
207  uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted)
208 {
209  if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed
210  const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0];
211  uint8_t current_num_levels = num_levels_in;
212  uint32_t current_item_count = starting_item_count; // decreases with each compaction
213  uint32_t target_item_count = compute_total_capacity(k, m, current_num_levels); // increases if we add levels
214  bool done_yet = false;
215  out_levels[0] = 0;
216  uint8_t current_level = 0;
217  while (!done_yet) {
218 
219  // If we are at the current top level, add an empty level above it for convenience,
220  // but do not increment num_levels until later
221  if (current_level == (current_num_levels - 1)) {
222  in_levels[current_level + 2] = in_levels[current_level + 1];
223  }
224 
225  const auto raw_beg = in_levels[current_level];
226  const auto raw_lim = in_levels[current_level + 1];
227  const auto raw_pop = raw_lim - raw_beg;
228 
229  if ((current_item_count < target_item_count) || (raw_pop < level_capacity(k, current_num_levels, current_level, m))) {
230  // move level over as is
231  // make sure we are not moving data upwards
232  if (raw_beg < out_levels[current_level]) throw std::logic_error("wrong move");
233  if (raw_beg != out_levels[current_level])
234  std::move(items + raw_beg, items + raw_lim, items + out_levels[current_level]);
235  out_levels[current_level + 1] = out_levels[current_level] + raw_pop;
236  } else {
237  // The sketch is too full AND this level is too full, so we compact it
238  // Note: this can add a level and thus change the sketches capacities
239 
240  const auto pop_above = in_levels[current_level + 2] - raw_lim;
241  const bool odd_pop = is_odd(raw_pop);
242  const auto adj_beg = odd_pop ? 1 + raw_beg : raw_beg;
243  const auto adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
244  const auto half_adj_pop = adj_pop / 2;
245 
246  if (odd_pop) { // move one guy over
247  if (out_levels[current_level] != raw_beg)
248  items[out_levels[current_level]] = std::move(items[raw_beg]);
249  out_levels[current_level + 1] = out_levels[current_level] + 1;
250  } else { // even number of items
251  out_levels[current_level + 1] = out_levels[current_level];
252  }
253 
254  // level zero might not be sorted, so we must sort it if we wish to compact it
255  if ((current_level == 0) && !is_level_zero_sorted) {
256  std::sort(items + adj_beg, items + adj_beg + adj_pop, C());
257  }
258 
259  if (pop_above == 0) { // Level above is empty, so halve up
260  randomly_halve_up(items, adj_beg, adj_pop);
261  } else { // Level above is nonempty, so halve down, then merge up
262  randomly_halve_down(items, adj_beg, adj_pop);
263  merge_sorted_arrays<T, C>(items, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
264  }
265 
266  // track the fact that we just eliminated some data
267  current_item_count -= half_adj_pop;
268 
269  // adjust the boundaries of the level above
270  in_levels[current_level + 1] = in_levels[current_level + 1] - half_adj_pop;
271 
272  // increment num_levels if we just compacted the old top level
273  // this creates some more capacity (the size of the new bottom level)
274  if (current_level == (current_num_levels - 1)) {
275  current_num_levels++;
276  target_item_count += level_capacity(k, current_num_levels, 0, m);
277  }
278 
279  } // end of code for compacting a level
280 
281  // determine whether we have processed all levels yet (including any new levels that we created)
282 
283  if (current_level == (current_num_levels - 1)) done_yet = true;
284  current_level++;
285  } // end of loop over levels
286 
287  if ((out_levels[current_num_levels] - out_levels[0]) != current_item_count) throw std::logic_error("inconsistent state");
288 
289  for (uint32_t i = current_item_count; i < starting_item_count; i++) items[i].~T();
290 
291  compress_result result;
292  result.final_num_levels = current_num_levels;
293  result.final_capacity = target_item_count;
294  result.final_num_items = current_item_count;
295  return result;
296 }
297 
298 template<typename T>
299 void kll_helper::copy_construct(const T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first) {
300  while (src_first != src_last) {
301  new (&dst[dst_first++]) T(src[src_first++]);
302  }
303 }
304 
305 template<typename T>
306 void kll_helper::move_construct(T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first, bool destroy) {
307  while (src_first != src_last) {
308  new (&dst[dst_first++]) T(std::move(src[src_first]));
309  if (destroy) src[src_first].~T();
310  src_first++;
311  }
312 }
313 
314 #ifdef KLL_VALIDATION
315 uint32_t kll_helper::deterministic_offset() {
316  const uint32_t result(kll_next_offset);
317  kll_next_offset = 1 - kll_next_offset;
318  return result;
319 }
320 #endif
321 
322 } /* namespace datasketches */
323 
324 #endif // KLL_HELPER_IMPL_HPP_
DataSketches namespace.
Definition: binomial_bounds.hpp:38