datasketches-cpp
theta_union_base_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_UNION_BASE_IMPL_HPP_
21 #define THETA_UNION_BASE_IMPL_HPP_
22 
23 #include <algorithm>
24 #include <stdexcept>
25 
26 #include "conditional_forward.hpp"
27 
28 namespace datasketches {
29 
30 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
31 theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
32  float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
33 policy_(policy),
34 table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
35 union_theta_(table_.theta_)
36 {}
37 
38 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
39 template<typename SS>
40 void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
41  if (sketch.is_empty()) return;
42  if (sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
43  table_.is_empty_ = false;
44  union_theta_ = std::min(union_theta_, sketch.get_theta64());
45  for (auto&& entry: sketch) {
46  const uint64_t hash = EK()(entry);
47  if (hash < union_theta_ && hash < table_.theta_) {
48  auto result = table_.find(hash);
49  if (!result.second) {
50  table_.insert(result.first, conditional_forward<SS>(entry));
51  } else {
52  policy_(*result.first, conditional_forward<SS>(entry));
53  }
54  } else {
55  if (sketch.is_ordered()) break; // early stop
56  }
57  }
58  union_theta_ = std::min(union_theta_, table_.theta_);
59 }
60 
61 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
62 CS theta_union_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const {
63  std::vector<EN, A> entries(table_.allocator_);
64  if (table_.is_empty_) return CS(true, true, compute_seed_hash(table_.seed_), union_theta_, std::move(entries));
65  entries.reserve(table_.num_entries_);
66  uint64_t theta = std::min(union_theta_, table_.theta_);
67  const uint32_t nominal_num = 1 << table_.lg_nom_size_;
68  if (union_theta_ >= table_.theta_) {
69  std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero<EN, EK>());
70  } else {
71  std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero_less_than<uint64_t, EN, EK>(theta));
72  }
73  if (entries.size() > nominal_num) {
74  std::nth_element(entries.begin(), entries.begin() + nominal_num, entries.end(), comparator());
75  theta = EK()(entries[nominal_num]);
76  entries.erase(entries.begin() + nominal_num, entries.end());
77  entries.shrink_to_fit();
78  }
79  if (ordered) std::sort(entries.begin(), entries.end(), comparator());
80  return CS(table_.is_empty_, ordered, compute_seed_hash(table_.seed_), theta, std::move(entries));
81 }
82 
83 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
84 const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
85  return policy_;
86 }
87 
88 template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
89 void theta_union_base<EN, EK, P, S, CS, A>::reset() {
90  table_.reset();
91  union_theta_ = table_.theta_;
92 }
93 
94 } /* namespace datasketches */
95 
96 #endif
DataSketches namespace.
Definition: binomial_bounds.hpp:38