datasketches-cpp
Loading...
Searching...
No Matches
theta_jaccard_similarity_base.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
21#define THETA_JACCARD_SIMILARITY_BASE_HPP_
22
23#include <memory>
24#include <array>
25
26#include "theta_constants.hpp"
27#include "bounds_on_ratios_in_theta_sketched_sets.hpp"
28#include "ceiling_power_of_2.hpp"
29#include "common_defs.hpp"
30
31namespace datasketches {
32
34template<typename Union, typename Intersection, typename ExtractKey>
36public:
37
54 template<typename SketchA, typename SketchB>
55 static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
56 if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
57 if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
58 if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
59
60 auto union_ab = compute_union(sketch_a, sketch_b, seed);
61 if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
62
63 // intersection
64 Intersection i(seed);
65 i.update(sketch_a);
66 i.update(sketch_b);
67 i.update(union_ab); // ensures that intersection is a subset of the union
68 auto inter_abu = i.get_result(false);
69
70 return {
74 };
75 }
76
84 template<typename SketchA, typename SketchB>
85 static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
86 if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
87 if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
88 if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
89
90 auto union_ab = compute_union(sketch_a, sketch_b, seed);
91 if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
92 return false;
93 }
94
109 template<typename SketchA, typename SketchB>
110 static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
111 auto jc = jaccard(actual, expected, seed);
112 return jc[0] >= threshold;
113 }
114
129 template<typename SketchA, typename SketchB>
130 static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
131 auto jc = jaccard(actual, expected, seed);
132 return jc[2] <= threshold;
133 }
134
135private:
136
137 template<typename SketchA, typename SketchB>
138 static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
139 const auto count_a = sketch_a.get_num_retained();
140 const auto count_b = sketch_b.get_num_retained();
141 const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
142 auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
143 u.update(sketch_a);
144 u.update(sketch_b);
145 return u.get_result(false);
146 }
147
148 template<typename SketchA, typename SketchB, typename UnionAB>
149 static bool identical_sets(const SketchA& sketch_a, const SketchB& sketch_b, const UnionAB& union_ab) {
150 if (union_ab.get_num_retained() == sketch_a.get_num_retained() &&
151 union_ab.get_num_retained() == sketch_b.get_num_retained() &&
152 union_ab.get_theta64() == sketch_a.get_theta64() &&
153 union_ab.get_theta64() == sketch_b.get_theta64()) return true;
154 return false;
155 }
156
157};
158
159} /* namespace datasketches */
160
161# endif
static double estimate_of_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the estimate for B over A.
Definition bounds_on_ratios_in_theta_sketched_sets.hpp:103
static double upper_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate upper bound for B over A based on a 95% confidence interval.
Definition bounds_on_ratios_in_theta_sketched_sets.hpp:81
static double lower_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate lower bound for B over A based on a 95% confidence interval.
Definition bounds_on_ratios_in_theta_sketched_sets.hpp:59
Base class for Jaccard similarity.
Definition theta_jaccard_similarity_base.hpp:35
static bool exactly_equal(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Returns true if the two given sketches are equivalent.
Definition theta_jaccard_similarity_base.hpp:85
static bool dissimilarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests dissimilarity of an actual Sketch against an expected Sketch.
Definition theta_jaccard_similarity_base.hpp:130
static std::array< double, 3 > jaccard(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Computes the Jaccard similarity index with upper and lower bounds.
Definition theta_jaccard_similarity_base.hpp:55
static bool similarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests similarity of an actual Sketch against an expected Sketch.
Definition theta_jaccard_similarity_base.hpp:110
const uint8_t MIN_LG_K
min log2 of K
Definition theta_constants.hpp:38
const uint8_t MAX_LG_K
max log2 of K
Definition theta_constants.hpp:40
DataSketches namespace.
Definition binomial_bounds.hpp:38