20 #ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
21 #define THETA_JACCARD_SIMILARITY_BASE_HPP_
26 #include "theta_constants.hpp"
27 #include "bounds_on_ratios_in_theta_sketched_sets.hpp"
28 #include "ceiling_power_of_2.hpp"
29 #include "common_defs.hpp"
34 template<
typename Union,
typename Intersection,
typename ExtractKey>
// Computes the Jaccard similarity index with upper and lower bounds for two
// sketches, returned as an array of three doubles.
//
// sketch_a, sketch_b: the sketches to compare; only is_empty() is called on
//   them directly in the visible code, the rest goes through compute_union()
//   and identical_sets().
// seed: forwarded to compute_union(); defaults to DEFAULT_SEED.
//
// NOTE(review): the order of the three returned values is not visible in this
// listing. similarity_test() compares element [0] with ">=" and
// dissimilarity_test() compares element [2] with "<=", so the layout is
// presumably {lower bound, estimate, upper bound} -- confirm against the
// return statement.
// NOTE(review): the embedded line numbers jump from 61 to 68 and stop there,
// so the declaration/setup of `i` and the final return are not shown here.
54 template<
typename SketchA,
typename SketchB>
55 static std::array<double, 3>
jaccard(
const SketchA& sketch_a,
const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
// Both arguments refer to the same object (address comparison): all three
// values are 1.
56 if (
reinterpret_cast<const void*
>(&sketch_a) ==
reinterpret_cast<const void*
>(&sketch_b))
return {1, 1, 1};
// Both sketches empty: all three values are 1.
57 if (sketch_a.is_empty() && sketch_b.is_empty())
return {1, 1, 1};
// Exactly one sketch empty (the both-empty case was handled above): all
// three values are 0.
58 if (sketch_a.is_empty() || sketch_b.is_empty())
return {0, 0, 0};
// The union is needed both for the identical-sets shortcut below and,
// presumably, for the ratio computed afterwards (not visible here).
60 auto union_ab = compute_union(sketch_a, sketch_b, seed);
61 if (identical_sets(sketch_a, sketch_b, union_ab))
return {1, 1, 1};
// `i` is declared on lines omitted from this listing; presumably an
// Intersection fed with both sketches -- TODO confirm.
68 auto inter_abu = i.get_result(
false);
// Returns true if the two given sketches are equivalent.
//
// sketch_a, sketch_b: the sketches to compare.
// seed: forwarded to compute_union(); defaults to DEFAULT_SEED.
//
// NOTE(review): the embedded line numbers stop at 91 for this function, so
// the result for the case where identical_sets() is false, and the closing
// brace, are not shown in this listing.
84 template<
typename SketchA,
typename SketchB>
85 static bool exactly_equal(
const SketchA& sketch_a,
const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
// Both arguments refer to the same object (address comparison).
86 if (
reinterpret_cast<const void*
>(&sketch_a) ==
reinterpret_cast<const void*
>(&sketch_b))
return true;
// Two empty sketches are treated as equal.
87 if (sketch_a.is_empty() && sketch_b.is_empty())
return true;
// Exactly one is empty (the both-empty case was handled above): not equal.
88 if (sketch_a.is_empty() || sketch_b.is_empty())
return false;
// Compare each input against the union of the two via identical_sets().
90 auto union_ab = compute_union(sketch_a, sketch_b, seed);
91 if (identical_sets(sketch_a, sketch_b, union_ab))
return true;
// Tests similarity of an actual Sketch against an expected Sketch.
//
// actual, expected: the two sketches passed to jaccard().
// threshold: value that element [0] of the jaccard() result is compared to.
// seed: forwarded to jaccard(); defaults to DEFAULT_SEED.
//
// Returns true when element [0] of the jaccard() result is greater than or
// equal to threshold.
// NOTE(review): element [0] is presumably the lower bound of the Jaccard
// index -- confirm against jaccard()'s return statement.
109 template<
typename SketchA,
typename SketchB>
110 static bool similarity_test(
const SketchA& actual,
const SketchB& expected,
double threshold, uint64_t seed = DEFAULT_SEED) {
111 auto jc =
jaccard(actual, expected, seed);
112 return jc[0] >= threshold;
// Tests dissimilarity of an actual Sketch against an expected Sketch.
//
// actual, expected: the two sketches passed to jaccard().
// threshold: value that element [2] of the jaccard() result is compared to.
// seed: forwarded to jaccard(); defaults to DEFAULT_SEED.
//
// Returns true when element [2] of the jaccard() result is less than or
// equal to threshold.
// NOTE(review): element [2] is presumably the upper bound of the Jaccard
// index -- confirm against jaccard()'s return statement.
129 template<
typename SketchA,
typename SketchB>
130 static bool dissimilarity_test(
const SketchA& actual,
const SketchB& expected,
double threshold, uint64_t seed = DEFAULT_SEED) {
131 auto jc =
jaccard(actual, expected, seed);
132 return jc[2] <= threshold;
// Builds a Union object configured with lg_k and the given seed, and returns
// its result as a Union::CompactSketch.
//
// sketch_a, sketch_b: the input sketches; their retained-entry counts are
//   read here.
// seed: passed to the Union builder via set_seed().
//
// NOTE(review): the embedded line numbers skip 141 and 143-144, so the
// definition of lg_k and the calls that feed the sketches into `u` are not
// shown in this listing. lg_k is presumably derived from count_a and count_b
// and limited by MIN_LG_K / MAX_LG_K -- TODO confirm.
137 template<
typename SketchA,
typename SketchB>
138 static typename Union::CompactSketch compute_union(
const SketchA& sketch_a,
const SketchB& sketch_b, uint64_t seed) {
// Number of entries retained by each input sketch.
139 const auto count_a = sketch_a.get_num_retained();
140 const auto count_b = sketch_b.get_num_retained();
142 auto u =
typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
145 return u.get_result(
false);
// Checks whether the union of two sketches matches both inputs.
//
// sketch_a, sketch_b: the original sketches.
// union_ab: the union computed from sketch_a and sketch_b by the caller.
//
// Returns true when union_ab has the same number of retained entries and the
// same theta64 value as sketch_a and as sketch_b.
// NOTE(review): the return for the non-matching case and the closing brace
// are not shown in this listing.
148 template<
typename SketchA,
typename SketchB,
typename UnionAB>
149 static bool identical_sets(
const SketchA& sketch_a,
const SketchB& sketch_b,
const UnionAB& union_ab) {
// All four equalities must hold: retained count and theta64, against each
// of the two inputs.
150 if (union_ab.get_num_retained() == sketch_a.get_num_retained() &&
151 union_ab.get_num_retained() == sketch_b.get_num_retained() &&
152 union_ab.get_theta64() == sketch_a.get_theta64() &&
153 union_ab.get_theta64() == sketch_b.get_theta64())
return true;
static double estimate_of_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the estimate for B over A.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:103
static double upper_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate upper bound for B over A based on a 95% confidence interval.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:81
static double lower_bound_for_b_over_a(const SketchA &sketch_a, const SketchB &sketch_b)
Gets the approximate lower bound for B over A based on a 95% confidence interval.
Definition: bounds_on_ratios_in_theta_sketched_sets.hpp:59
Base class for Jaccard similarity.
Definition: theta_jaccard_similarity_base.hpp:35
static bool exactly_equal(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Returns true if the two given sketches are equivalent.
Definition: theta_jaccard_similarity_base.hpp:85
static bool dissimilarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests dissimilarity of an actual Sketch against an expected Sketch.
Definition: theta_jaccard_similarity_base.hpp:130
static std::array< double, 3 > jaccard(const SketchA &sketch_a, const SketchB &sketch_b, uint64_t seed=DEFAULT_SEED)
Computes the Jaccard similarity index with upper and lower bounds.
Definition: theta_jaccard_similarity_base.hpp:55
static bool similarity_test(const SketchA &actual, const SketchB &expected, double threshold, uint64_t seed=DEFAULT_SEED)
Tests similarity of an actual Sketch against an expected Sketch.
Definition: theta_jaccard_similarity_base.hpp:110
const uint8_t MIN_LG_K
min log2 of K
Definition: theta_constants.hpp:38
const uint8_t MAX_LG_K
max log2 of K
Definition: theta_constants.hpp:40
DataSketches namespace.
Definition: binomial_bounds.hpp:38