datasketches-cpp
Loading...
Searching...
No Matches
kolmogorov_smirnov_impl.hpp
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21#define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22
23#include <cmath>
24#include <algorithm>
25
26namespace datasketches {
27
28template<typename Sketch>
29double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
30 auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
31 auto view1 = sketch1.get_sorted_view();
32 auto view2 = sketch2.get_sorted_view();
33 auto it1 = view1.begin();
34 auto it2 = view2.begin();
35 const auto n1 = sketch1.get_n();
36 const auto n2 = sketch2.get_n();
37 double delta = 0;
38 while (it1 != view1.end() && it2 != view2.end()) {
39 const double norm_cum_wt1 = static_cast<double>(it1.get_cumulative_weight(false)) / n1;
40 const double norm_cum_wt2 = static_cast<double>(it2.get_cumulative_weight(false)) / n2;
41 delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
42 if (comparator((*it1).first, (*it2).first)) {
43 ++it1;
44 } else if (comparator((*it2).first, (*it1).first)) {
45 ++it2;
46 } else {
47 ++it1;
48 ++it2;
49 }
50 }
51 const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>(it1.get_cumulative_weight(false)) / n1;
52 const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>(it2.get_cumulative_weight(false)) / n2;
53 delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
54 return delta;
55}
56
57template<typename Sketch>
58double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
59 const double r1 = sketch1.get_num_retained();
60 const double r2 = sketch2.get_num_retained();
61 const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
62 const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
63 const double eps1 = sketch1.get_normalized_rank_error(false);
64 const double eps2 = sketch2.get_normalized_rank_error(false);
65 return delta_area_threshold + eps1 + eps2;
66}
67
68template<typename Sketch>
69bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
70 return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
71}
72
73} /* namespace datasketches */
74
75#endif
static double delta(const Sketch &sketch1, const Sketch &sketch2)
Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
Definition kolmogorov_smirnov_impl.hpp:29
static bool test(const Sketch &sketch1, const Sketch &sketch2, double p)
Performs the Kolmogorov-Smirnov Test between two quantile sketches.
Definition kolmogorov_smirnov_impl.hpp:69
static double threshold(const Sketch &sketch1, const Sketch &sketch2, double p)
Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
Definition kolmogorov_smirnov_impl.hpp:58
DataSketches namespace.
Definition binomial_bounds.hpp:38