datasketches-cpp
theta_jaccard_similarity_base.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
21 #define THETA_JACCARD_SIMILARITY_BASE_HPP_
22 
23 #include <memory>
24 #include <array>
25 
26 #include "theta_constants.hpp"
27 #include "bounds_on_ratios_in_theta_sketched_sets.hpp"
28 #include "ceiling_power_of_2.hpp"
29 #include "common_defs.hpp"
30 
31 namespace datasketches {
32 
34 template<typename Union, typename Intersection, typename ExtractKey>
36 public:
37 
54  template<typename SketchA, typename SketchB>
55  static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
56  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
57  if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
58  if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
59 
60  auto union_ab = compute_union(sketch_a, sketch_b, seed);
61  if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
62 
63  // intersection
64  Intersection i(seed);
65  i.update(sketch_a);
66  i.update(sketch_b);
67  i.update(union_ab); // ensures that intersection is a subset of the union
68  auto inter_abu = i.get_result(false);
69 
70  return {
74  };
75  }
76 
84  template<typename SketchA, typename SketchB>
85  static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
86  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
87  if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
88  if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
89 
90  auto union_ab = compute_union(sketch_a, sketch_b, seed);
91  if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
92  return false;
93  }
94 
109  template<typename SketchA, typename SketchB>
110  static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
111  auto jc = jaccard(actual, expected, seed);
112  return jc[0] >= threshold;
113  }
114 
129  template<typename SketchA, typename SketchB>
130  static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
131  auto jc = jaccard(actual, expected, seed);
132  return jc[2] <= threshold;
133  }
134 
135 private:
136 
137  template<typename SketchA, typename SketchB>
138  static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
139  const auto count_a = sketch_a.get_num_retained();
140  const auto count_b = sketch_b.get_num_retained();
141  const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
142  auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
143  u.update(sketch_a);
144  u.update(sketch_b);
145  return u.get_result(false);
146  }
147 
148  template<typename SketchA, typename SketchB, typename UnionAB>
149  static bool identical_sets(const SketchA& sketch_a, const SketchB& sketch_b, const UnionAB& union_ab) {
150  if (union_ab.get_num_retained() == sketch_a.get_num_retained() &&
151  union_ab.get_num_retained() == sketch_b.get_num_retained() &&
152  union_ab.get_theta64() == sketch_a.get_theta64() &&
153  union_ab.get_theta64() == sketch_b.get_theta64()) return true;
154  return false;
155  }
156 
157 };
158 
159 } /* namespace datasketches */
160 
161 # endif
static double estimate_of_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the estimate for B over A.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:103
static double upper_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate upper bound for B over A based on a 95% confidence interval.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:81
static double lower_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate lower bound for B over A based on a 95% confidence interval.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:59
Base class for Jaccard similarity.
Definition: theta_jaccard_similarity_base.hpp:35
static bool exactly_equal(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Returns true if the two given sketches are equivalent.
Definition: theta_jaccard_similarity_base.hpp:85
static bool dissimilarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests dissimilarity of an actual Sketch against an expected Sketch.
Definition: theta_jaccard_similarity_base.hpp:130
static std::array< double, 3 > jaccard(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Computes the Jaccard similarity index with upper and lower bounds.
Definition: theta_jaccard_similarity_base.hpp:55
static bool similarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests similarity of an actual Sketch against an expected Sketch.
Definition: theta_jaccard_similarity_base.hpp:110
const uint8_t MIN_LG_K
min log2 of K
Definition: theta_constants.hpp:38
const uint8_t MAX_LG_K
max log2 of K
Definition: theta_constants.hpp:40
DataSketches namespace.
Definition: binomial_bounds.hpp:38