datasketches-cpp
bounds_binomial_proportions.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
21 #define _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
22 
23 #include <cmath>
24 #include <stdexcept>
25 
26 namespace datasketches {
27 
81 class bounds_binomial_proportions { // confidence intervals for binomial proportions
82 
83 public:
113  static inline double approximate_lower_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
114  check_inputs(n, k);
115  if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
116  else if (k == 0) { return 0.0; }
117  else if (k == 1) { return (exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs))); }
118  else if (k == n) { return (exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs))); }
119  else {
120  double x = abramowitz_stegun_formula_26p5p22((n - k) + 1.0, static_cast<double>(k), (-1.0 * num_std_devs));
121  return (1.0 - x); // which is p
122  }
123  }
124 
148  static inline double approximate_upper_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
149  check_inputs(n, k);
150  if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
151  else if (k == n) { return 1.0; }
152  else if (k == (n - 1)) {
153  return (exact_upper_bound_on_p_k_eq_minusone(n, delta_of_num_stdevs(num_std_devs)));
154  }
155  else if (k == 0) {
156  return (exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs)));
157  }
158  else {
159  double x = abramowitz_stegun_formula_26p5p22(static_cast<double>(n - k), k + 1.0, num_std_devs);
160  return (1.0 - x); // which is p
161  }
162  }
163 
170  static inline double estimate_unknown_p(uint64_t n, uint64_t k) {
171  check_inputs(n, k);
172  if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
173  else { return ((double) k / (double) n); }
174  }
175 
181  static inline double erf(double x) {
182  if (x < 0.0) { return (-1.0 * (erf_of_nonneg(-1.0 * x))); }
183  else { return (erf_of_nonneg(x)); }
184  }
185 
191  static inline double normal_cdf(double x) {
192  return (0.5 * (1.0 + (erf(x / (sqrt(2.0))))));
193  }
194 
195 private:
196  static inline void check_inputs(uint64_t n, uint64_t k) {
197  if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
198  }
199 
200  //@formatter:off
201  // Abramowitz and Stegun formula 7.1.28, p. 88; Claims accuracy of about 7 decimal digits */
202  static inline double erf_of_nonneg(double x) {
203  // The constants that appear below, formatted for easy checking against the book.
204  // a1 = 0.07052 30784
205  // a3 = 0.00927 05272
206  // a5 = 0.00027 65672
207  // a2 = 0.04228 20123
208  // a4 = 0.00015 20143
209  // a6 = 0.00004 30638
210  static const double a1 = 0.0705230784;
211  static const double a3 = 0.0092705272;
212  static const double a5 = 0.0002765672;
213  static const double a2 = 0.0422820123;
214  static const double a4 = 0.0001520143;
215  static const double a6 = 0.0000430638;
216  const double x2 = x * x; // x squared, x cubed, etc.
217  const double x3 = x2 * x;
218  const double x4 = x2 * x2;
219  const double x5 = x2 * x3;
220  const double x6 = x3 * x3;
221  const double sum = ( 1.0
222  + (a1 * x)
223  + (a2 * x2)
224  + (a3 * x3)
225  + (a4 * x4)
226  + (a5 * x5)
227  + (a6 * x6) );
228  const double sum2 = sum * sum; // raise the sum to the 16th power
229  const double sum4 = sum2 * sum2;
230  const double sum8 = sum4 * sum4;
231  const double sum16 = sum8 * sum8;
232  return (1.0 - (1.0 / sum16));
233  }
234 
235  static inline double delta_of_num_stdevs(double kappa) {
236  return (normal_cdf(-1.0 * kappa));
237  }
238 
239  //@formatter:on
240  // Formula 26.5.22 on page 945 of Abramowitz & Stegun, which is an approximation
241  // of the inverse of the incomplete beta function I_x(a,b) = delta
242  // viewed as a scalar function of x.
243  // In other words, we specify delta, and it gives us x (with a and b held constant).
244  // However, delta is specified in an indirect way through yp which
245  // is the number of stdDevs that leaves delta probability in the right
246  // tail of a standard gaussian distribution.
247 
248  // We point out that the variable names correspond to those in the book,
249  // and it is worth keeping it that way so that it will always be easy to verify
250  // that the formula was typed in correctly.
251 
252  static inline double abramowitz_stegun_formula_26p5p22(double a, double b, double yp) {
253  const double b2m1 = (2.0 * b) - 1.0;
254  const double a2m1 = (2.0 * a) - 1.0;
255  const double lambda = ((yp * yp) - 3.0) / 6.0;
256  const double htmp = (1.0 / a2m1) + (1.0 / b2m1);
257  const double h = 2.0 / htmp;
258  const double term1 = (yp * (sqrt(h + lambda))) / h;
259  const double term2 = (1.0 / b2m1) - (1.0 / a2m1);
260  const double term3 = (lambda + (5.0 / 6.0)) - (2.0 / (3.0 * h));
261  const double w = term1 - (term2 * term3);
262  const double xp = a / (a + (b * (exp(2.0 * w))));
263  return xp;
264  }
265 
266  // Formulas for some special cases.
267 
268  static inline double exact_upper_bound_on_p_k_eq_zero(uint64_t n, double delta) {
269  return (1.0 - pow(delta, (1.0 / n)));
270  }
271 
272  static inline double exact_lower_bound_on_p_k_eq_n(uint64_t n, double delta) {
273  return (pow(delta, (1.0 / n)));
274  }
275 
276  static inline double exact_lower_bound_on_p_k_eq_1(uint64_t n, double delta) {
277  return (1.0 - pow((1.0 - delta), (1.0 / n)));
278  }
279 
280  static inline double exact_upper_bound_on_p_k_eq_minusone(uint64_t n, double delta) {
281  return (pow((1.0 - delta), (1.0 / n)));
282  }
283 
284 };
285 
286 }
287 
288 #endif // _BOUNDS_BINOMIAL_PROPORTIONS_HPP_
Confidence intervals for binomial proportions.
Definition: bounds_binomial_proportions.hpp:81
static double estimate_unknown_p(uint64_t n, uint64_t k)
Computes an estimate of an unknown binomial proportion.
Definition: bounds_binomial_proportions.hpp:170
static double approximate_upper_bound_on_p(uint64_t n, uint64_t k, double num_std_devs)
Computes upper bound of approximate Clopper-Pearson confidence interval for a binomial proportion.
Definition: bounds_binomial_proportions.hpp:148
static double normal_cdf(double x)
Computes an approximation to normal_cdf(x).
Definition: bounds_binomial_proportions.hpp:191
static double erf(double x)
Computes an approximation to the erf() function.
Definition: bounds_binomial_proportions.hpp:181
static double approximate_lower_bound_on_p(uint64_t n, uint64_t k, double num_std_devs)
Computes lower bound of approximate Clopper-Pearson confidence interval for a binomial proportion.
Definition: bounds_binomial_proportions.hpp:113
DataSketches namespace.
Definition: binomial_bounds.hpp:38