datasketches-cpp
theta_set_difference_base_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
21 #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22 
23 #include <algorithm>
24 #include <stdexcept>
25 
26 #include "conditional_back_inserter.hpp"
27 #include "conditional_forward.hpp"
28 
29 namespace datasketches {
30 
31 template<typename EN, typename EK, typename CS, typename A>
32 theta_set_difference_base<EN, EK, CS, A>::theta_set_difference_base(uint64_t seed, const A& allocator):
33 allocator_(allocator),
34 seed_hash_(compute_seed_hash(seed))
35 {}
36 
37 template<typename EN, typename EK, typename CS, typename A>
38 template<typename FwdSketch, typename Sketch>
39 CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
40  if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
41  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
42  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
43 
44  const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
45  std::vector<EN, A> entries(allocator_);
46  bool is_empty = a.is_empty();
47 
48  if (b.get_num_retained() == 0) {
49  std::copy_if(forward_begin(std::forward<FwdSketch>(a)), forward_end(std::forward<FwdSketch>(a)), std::back_inserter(entries),
50  key_less_than<uint64_t, EN, EK>(theta));
51  } else {
52  if (a.is_ordered() && b.is_ordered()) { // sort-based
53  std::set_difference(forward_begin(std::forward<FwdSketch>(a)), forward_end(std::forward<FwdSketch>(a)), b.begin(), b.end(),
54  conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
55  } else { // hash-based
56  const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
57  hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
58  for (const auto& entry: b) {
59  const uint64_t hash = EK()(entry);
60  if (hash < theta) {
61  table.insert(table.find(hash).first, hash);
62  } else if (b.is_ordered()) {
63  break; // early stop
64  }
65  }
66 
67  // scan A lookup B
68  for (auto&& entry: a) {
69  const uint64_t hash = EK()(entry);
70  if (hash < theta) {
71  auto result = table.find(hash);
72  if (!result.second) entries.push_back(conditional_forward<FwdSketch>(entry));
73  } else if (a.is_ordered()) {
74  break; // early stop
75  }
76  }
77  }
78  }
79  if (entries.empty() && theta == theta_constants::MAX_THETA) is_empty = true;
80  if (ordered && !a.is_ordered()) std::sort(entries.begin(), entries.end(), comparator());
81  return CS(is_empty, a.is_ordered() || ordered, seed_hash_, theta, std::move(entries));
82 }
83 
84 } /* namespace datasketches */
85 
86 #endif
DataSketches namespace.
Definition: binomial_bounds.hpp:38