datasketches-cpp
kolmogorov_smirnov_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21 #define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22 
23 #include <cmath>
24 #include <algorithm>
25 
26 namespace datasketches {
27 
28 template<typename Sketch>
29 double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
30  auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
31  auto view1 = sketch1.get_sorted_view();
32  auto view2 = sketch2.get_sorted_view();
33  auto it1 = view1.begin();
34  auto it2 = view2.begin();
35  const auto n1 = sketch1.get_n();
36  const auto n2 = sketch2.get_n();
37  double delta = 0;
38  while (it1 != view1.end() && it2 != view2.end()) {
39  const double norm_cum_wt1 = static_cast<double>(it1.get_cumulative_weight(false)) / n1;
40  const double norm_cum_wt2 = static_cast<double>(it2.get_cumulative_weight(false)) / n2;
41  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
42  if (comparator((*it1).first, (*it2).first)) {
43  ++it1;
44  } else if (comparator((*it2).first, (*it1).first)) {
45  ++it2;
46  } else {
47  ++it1;
48  ++it2;
49  }
50  }
51  const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>(it1.get_cumulative_weight(false)) / n1;
52  const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>(it2.get_cumulative_weight(false)) / n2;
53  delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
54  return delta;
55 }
56 
57 template<typename Sketch>
58 double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
59  const double r1 = sketch1.get_num_retained();
60  const double r2 = sketch2.get_num_retained();
61  const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
62  const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
63  const double eps1 = sketch1.get_normalized_rank_error(false);
64  const double eps2 = sketch2.get_normalized_rank_error(false);
65  return delta_area_threshold + eps1 + eps2;
66 }
67 
68 template<typename Sketch>
69 bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
70  return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
71 }
72 
73 } /* namespace datasketches */
74 
75 #endif
static double delta(const Sketch &sketch1, const Sketch &sketch2)
Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
Definition: kolmogorov_smirnov_impl.hpp:29
static bool test(const Sketch &sketch1, const Sketch &sketch2, double p)
Performs the Kolmogorov-Smirnov Test between two quantile sketches.
Definition: kolmogorov_smirnov_impl.hpp:69
static double threshold(const Sketch &sketch1, const Sketch &sketch2, double p)
Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
Definition: kolmogorov_smirnov_impl.hpp:58
DataSketches namespace.
Definition: binomial_bounds.hpp:38