Add optimized simultaneous ECDF bands (#2368)

* Add optimized ECDF confidence bands * Test ECDF optimized confidence bands * Default to optimized confidence bands * Use more descriptive names * Support optimized ECDF bands in plots * Test optimized ECDF bands in plots * Run black on new code * Drop unnecessary rvs keyword from examples * Update changelog * Fix or disable linting errors * Speed up optimized bands with numba if available * Make functions work even when jit is disabled * Fix pylint errors * Disable pylint warning * Add heuristic for switching between band methods * Add test for low and high number of draws * Run black
arviz-devs · Aug 28, 2024 · 5a928fb · 5a928fb
1 parent abcc7b4
commit 5a928fb
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## v0.x.x Unreleased
 
 ### New features
+- Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368))
 
 ### Maintenance and fixes
 

diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py
@@ -73,6 +73,7 @@ def plot_ecdf(
         - False: No confidence bands are plotted (default).
         - True: Plot bands computed with the default algorithm (subject to change)
         - "pointwise": Compute the pointwise (i.e. marginal) confidence band.
+        - "optimized": Use optimization to estimate a simultaneous confidence band.
         - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence
           band.
 
@@ -216,8 +217,7 @@ def plot_ecdf(
         >>> pit_vals = distribution.cdf(sample)
         >>> uniform_dist = uniform(0, 1)
         >>> az.plot_ecdf(
-        >>>     pit_vals, cdf=uniform_dist.cdf,
-        >>>     rvs=uniform_dist.rvs, confidence_bands=True
+        >>>     pit_vals, cdf=uniform_dist.cdf, confidence_bands=True,
         >>> )
 
     Plot an ECDF-difference plot of PIT values.
@@ -226,8 +226,8 @@ def plot_ecdf(
         :context: close-figs
 
         >>> az.plot_ecdf(
-        >>>     pit_vals, cdf = uniform_dist.cdf, rvs = uniform_dist.rvs,
-        >>>     confidence_bands = True, difference = True
+        >>>     pit_vals, cdf = uniform_dist.cdf, confidence_bands = True,
+        >>>     difference = True
         >>> )
     """
     if confidence_bands is True:
@@ -238,9 +238,12 @@ def plot_ecdf(
             )
             confidence_bands = "pointwise"
         else:
-            confidence_bands = "simulated"
-    elif confidence_bands == "simulated" and pointwise:
-        raise ValueError("Cannot specify both `confidence_bands='simulated'` and `pointwise=True`")
+            confidence_bands = "auto"
+        # if pointwise specified, confidence_bands must be a bool or 'pointwise'
+    elif confidence_bands not in [False, "pointwise"] and pointwise:
+        raise ValueError(
+            f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`"
+        )
 
     if fpr is not None:
         warnings.warn(
@@ -298,7 +301,7 @@ def plot_ecdf(
             "`eval_points` explicitly.",
             BehaviourChangeWarning,
         )
-        if confidence_bands == "simulated":
+        if confidence_bands in ["optimized", "simulated"]:
             warnings.warn(
                 "For simultaneous bands to be correctly calibrated, specify `eval_points` "
                 "independent of the `values`"
@@ -319,6 +322,11 @@ def plot_ecdf(
 
     if confidence_bands:
         ndraws = len(values)
+        if confidence_bands == "auto":
+            if ndraws < 200 or num_trials >= 250 * np.sqrt(ndraws):
+                confidence_bands = "optimized"
+            else:
+                confidence_bands = "simulated"
         x_bands = eval_points
         lower, higher = ecdf_confidence_band(
             ndraws,

diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py
@@ -1,10 +1,25 @@
 """Functions for evaluating ECDFs and their confidence bands."""
 
+import math
 from typing import Any, Callable, Optional, Tuple
 import warnings
 
 import numpy as np
 from scipy.stats import uniform, binom
+from scipy.optimize import minimize_scalar
+
+try:
+    from numba import jit, vectorize
+except ImportError:
+
+    def jit(*args, **kwargs):  # pylint: disable=unused-argument
+        return lambda f: f
+
+    def vectorize(*args, **kwargs):  # pylint: disable=unused-argument
+        return lambda f: f
+
+
+from ..utils import Numba
 
 
 def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
@@ -73,7 +88,7 @@ def ecdf_confidence_band(
     eval_points: np.ndarray,
     cdf_at_eval_points: np.ndarray,
     prob: float = 0.95,
-    method="simulated",
+    method="optimized",
     **kwargs,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Compute the `prob`-level confidence band for the ECDF.
@@ -92,6 +107,7 @@ def ecdf_confidence_band(
     method : string, default "simulated"
         The method used to compute the confidence band. Valid options are:
         - "pointwise": Compute the pointwise (i.e. marginal) confidence band.
+        - "optimized": Use optimization to estimate a simultaneous confidence band.
         - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
           `rvs` must be provided.
     rvs: callable, optional
@@ -115,12 +131,18 @@ def ecdf_confidence_band(
 
     if method == "pointwise":
         prob_pointwise = prob
+    elif method == "optimized":
+        prob_pointwise = _optimize_simultaneous_ecdf_band_probability(
+            ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
+        )
     elif method == "simulated":
         prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
             ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
         )
     else:
-        raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")
+        raise ValueError(
+            f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'."
+        )
 
     prob_lower, prob_upper = _get_pointwise_confidence_band(
         prob_pointwise, ndraws, cdf_at_eval_points
@@ -129,6 +151,139 @@ def ecdf_confidence_band(
     return prob_lower, prob_upper
 
 
+def _update_ecdf_band_interior_probabilities(
+    prob_left: np.ndarray,
+    interval_left: np.ndarray,
+    interval_right: np.ndarray,
+    p: float,
+    ndraws: int,
+) -> np.ndarray:
+    """Update the probability that an ECDF has been within the envelope including at the current
+    point.
+
+    Arguments
+    ---------
+    prob_left : np.ndarray
+        For each point in the interior at the previous point, the joint probability that it and all
+        points before are in the interior.
+    interval_left : np.ndarray
+        The set of points in the interior at the previous point.
+    interval_right : np.ndarray
+        The set of points in the interior at the current point.
+    p : float
+        The probability of any given point found between the previous point and the current one.
+    ndraws : int
+        Number of draws in the original dataset.
+
+    Returns
+    -------
+    prob_right : np.ndarray
+        For each point in the interior at the current point, the joint probability that it and all
+        previous points are in the interior.
+    """
+    interval_left = interval_left[:, np.newaxis]
+    prob_conditional = binom.pmf(interval_right, ndraws - interval_left, p, loc=interval_left)
+    prob_right = prob_left.dot(prob_conditional)
+    return prob_right
+
+
+@vectorize(["float64(int64, int64, float64, int64)"])
+def _binom_pmf(k, n, p, loc):
+    k -= loc
+    if k < 0 or k > n:
+        return 0.0
+    if p == 0:
+        return 1.0 if k == 0 else 0.0
+    if p == 1:
+        return 1.0 if k == n else 0.0
+    if k == 0:
+        return (1 - p) ** n
+    if k == n:
+        return p**n
+    lbinom = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)
+    return np.exp(lbinom + k * np.log(p) + (n - k) * np.log1p(-p))
+
+
+@jit(nopython=True)
+def _update_ecdf_band_interior_probabilities_numba(
+    prob_left: np.ndarray,
+    interval_left: np.ndarray,
+    interval_right: np.ndarray,
+    p: float,
+    ndraws: int,
+) -> np.ndarray:
+    interval_left = interval_left[:, np.newaxis]
+    prob_conditional = _binom_pmf(interval_right, ndraws - interval_left, p, interval_left)
+    prob_right = prob_left.dot(prob_conditional)
+    return prob_right
+
+
+def _ecdf_band_interior_probability(prob_between_points, ndraws, lower_count, upper_count):
+    interval_left = np.arange(1)
+    prob_interior = np.ones(1)
+    for i in range(prob_between_points.shape[0]):
+        interval_right = np.arange(lower_count[i], upper_count[i])
+        prob_interior = _update_ecdf_band_interior_probabilities(
+            prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
+        )
+        interval_left = interval_right
+    return prob_interior.sum()
+
+
+@jit(nopython=True)
+def _ecdf_band_interior_probability_numba(prob_between_points, ndraws, lower_count, upper_count):
+    interval_left = np.arange(1)
+    prob_interior = np.ones(1)
+    for i in range(prob_between_points.shape[0]):
+        interval_right = np.arange(lower_count[i], upper_count[i])
+        prob_interior = _update_ecdf_band_interior_probabilities_numba(
+            prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
+        )
+        interval_left = interval_right
+    return prob_interior.sum()
+
+
+def _ecdf_band_optimization_objective(
+    prob_pointwise: float,
+    cdf_at_eval_points: np.ndarray,
+    ndraws: int,
+    prob_target: float,
+) -> float:
+    """Objective function for optimizing the simultaneous confidence band probability."""
+    lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points)
+    lower_count = (lower * ndraws).astype(int)
+    upper_count = (upper * ndraws).astype(int) + 1
+    cdf_with_zero = np.insert(cdf_at_eval_points[:-1], 0, 0)
+    prob_between_points = (cdf_at_eval_points - cdf_with_zero) / (1 - cdf_with_zero)
+    if Numba.numba_flag:
+        prob_interior = _ecdf_band_interior_probability_numba(
+            prob_between_points, ndraws, lower_count, upper_count
+        )
+    else:
+        prob_interior = _ecdf_band_interior_probability(
+            prob_between_points, ndraws, lower_count, upper_count
+        )
+    return abs(prob_interior - prob_target)
+
+
+def _optimize_simultaneous_ecdf_band_probability(
+    ndraws: int,
+    eval_points: np.ndarray,  # pylint: disable=unused-argument
+    cdf_at_eval_points: np.ndarray,
+    prob: float = 0.95,
+    **kwargs,  # pylint: disable=unused-argument
+):
+    """Estimate probability for simultaneous confidence band using optimization.
+
+    This function simulates the pointwise probability needed to construct pointwise confidence bands
+    that form a `prob`-level confidence envelope for the ECDF of a sample.
+    """
+    cdf_at_eval_points = np.unique(cdf_at_eval_points)
+    objective = lambda p: _ecdf_band_optimization_objective(p, cdf_at_eval_points, ndraws, prob)
+    prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x
+    return prob_pointwise
+
+
 def _simulate_simultaneous_ecdf_band_probability(
     ndraws: int,
     eval_points: np.ndarray,

diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py
@@ -1285,10 +1285,11 @@ def test_plot_ecdf_eval_points():
     assert axes is not None
 
 
-@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "simulated"])
-def test_plot_ecdf_confidence_bands(confidence_bands):
+@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"])
+@pytest.mark.parametrize("ndraws", [100, 10_000])
+def test_plot_ecdf_confidence_bands(confidence_bands, ndraws):
     """Check that all confidence_bands values correctly accepted"""
-    data = np.random.randn(4, 1000)
+    data = np.random.randn(4, ndraws // 4)
     axes = plot_ecdf(data, confidence_bands=confidence_bands, cdf=norm(0, 1).cdf)
     assert axes is not None
 
@@ -1326,6 +1327,8 @@ def test_plot_ecdf_error():
     # contradictory confidence band types
     with pytest.raises(ValueError):
         plot_ecdf(data, cdf=dist.cdf, confidence_bands="simulated", pointwise=True)
+    with pytest.raises(ValueError):
+        plot_ecdf(data, cdf=dist.cdf, confidence_bands="optimized", pointwise=True)
     plot_ecdf(data, cdf=dist.cdf, confidence_bands=True, pointwise=True)
     plot_ecdf(data, cdf=dist.cdf, confidence_bands="pointwise")
 

diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py
@@ -10,6 +10,13 @@
     _get_pointwise_confidence_band,
 )
 
+try:
+    import numba  # pylint: disable=unused-import
+
+    numba_options = [True, False]
+except ImportError:
+    numba_options = [False]
+
 
 def test_compute_ecdf():
     """Test compute_ecdf function."""
@@ -109,9 +116,15 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see
     ids=["continuous", "continuous default rvs", "discrete"],
 )
 @pytest.mark.parametrize("ndraws", [10_000])
-@pytest.mark.parametrize("method", ["pointwise", "simulated"])
-def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57):
+@pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"])
+@pytest.mark.parametrize("use_numba", numba_options)
+def test_ecdf_confidence_band(
+    dist, rvs, prob, ndraws, method, use_numba, num_trials=1_000, seed=57
+):
     """Test test_ecdf_confidence_band."""
+    if use_numba and method != "optimized":
+        pytest.skip("Numba only used in optimized method")
+
     eval_points = np.linspace(*dist.interval(0.99), 10)
     cdf_at_eval_points = dist.cdf(eval_points)
     random_state = np.random.default_rng(seed)