Skip to content

Commit

Permalink
Add optimized simultaneous ECDF bands (#2368)
Browse files Browse the repository at this point in the history
* Add optimized ECDF confidence bands

* Test ECDF optimized confidence bands

* Default to optimized confidence bands

* Use more descriptive names

* Support optimized ECDF bands in plots

* Test optimized ECDF bands in plots

* Run black on new code

* Drop unnecessary rvs keyword from examples

* Update changelog

* Fix or disable linting errors

* Speed up optimized bands with numba if available

* Make functions work even when jit is disabled

* Fix pylint errors

* Disable pylint warning

* Add heuristic for switching between band methods

* Add test for low and high number of draws

* Run black
  • Loading branch information
sethaxen authored Aug 28, 2024
1 parent abcc7b4 commit 5a928fb
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## v0.x.x Unreleased

### New features
- Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368))

### Maintenance and fixes

Expand Down
24 changes: 16 additions & 8 deletions arviz/plots/ecdfplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def plot_ecdf(
- False: No confidence bands are plotted (default).
- True: Plot bands computed with the default algorithm (subject to change)
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "optimized": Use optimization to estimate a simultaneous confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence
band.
Expand Down Expand Up @@ -216,8 +217,7 @@ def plot_ecdf(
>>> pit_vals = distribution.cdf(sample)
>>> uniform_dist = uniform(0, 1)
>>> az.plot_ecdf(
>>> pit_vals, cdf=uniform_dist.cdf,
>>> rvs=uniform_dist.rvs, confidence_bands=True
>>> pit_vals, cdf=uniform_dist.cdf, confidence_bands=True,
>>> )
Plot an ECDF-difference plot of PIT values.
Expand All @@ -226,8 +226,8 @@ def plot_ecdf(
:context: close-figs
>>> az.plot_ecdf(
>>> pit_vals, cdf = uniform_dist.cdf, rvs = uniform_dist.rvs,
>>> confidence_bands = True, difference = True
>>> pit_vals, cdf = uniform_dist.cdf, confidence_bands = True,
>>> difference = True
>>> )
"""
if confidence_bands is True:
Expand All @@ -238,9 +238,12 @@ def plot_ecdf(
)
confidence_bands = "pointwise"
else:
confidence_bands = "simulated"
elif confidence_bands == "simulated" and pointwise:
raise ValueError("Cannot specify both `confidence_bands='simulated'` and `pointwise=True`")
confidence_bands = "auto"
# if pointwise specified, confidence_bands must be a bool or 'pointwise'
elif confidence_bands not in [False, "pointwise"] and pointwise:
raise ValueError(
f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`"
)

if fpr is not None:
warnings.warn(
Expand Down Expand Up @@ -298,7 +301,7 @@ def plot_ecdf(
"`eval_points` explicitly.",
BehaviourChangeWarning,
)
if confidence_bands == "simulated":
if confidence_bands in ["optimized", "simulated"]:
warnings.warn(
"For simultaneous bands to be correctly calibrated, specify `eval_points` "
"independent of the `values`"
Expand All @@ -319,6 +322,11 @@ def plot_ecdf(

if confidence_bands:
ndraws = len(values)
if confidence_bands == "auto":
if ndraws < 200 or num_trials >= 250 * np.sqrt(ndraws):
confidence_bands = "optimized"
else:
confidence_bands = "simulated"
x_bands = eval_points
lower, higher = ecdf_confidence_band(
ndraws,
Expand Down
159 changes: 157 additions & 2 deletions arviz/stats/ecdf_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
"""Functions for evaluating ECDFs and their confidence bands."""

import math
from typing import Any, Callable, Optional, Tuple
import warnings

import numpy as np
from scipy.stats import uniform, binom
from scipy.optimize import minimize_scalar

try:
from numba import jit, vectorize
except ImportError:

def jit(*args, **kwargs): # pylint: disable=unused-argument
return lambda f: f

def vectorize(*args, **kwargs): # pylint: disable=unused-argument
return lambda f: f


from ..utils import Numba


def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -73,7 +88,7 @@ def ecdf_confidence_band(
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
method="simulated",
method="optimized",
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level confidence band for the ECDF.
Expand All @@ -92,6 +107,7 @@ def ecdf_confidence_band(
method : string, default "simulated"
The method used to compute the confidence band. Valid options are:
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "optimized": Use optimization to estimate a simultaneous confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
`rvs` must be provided.
rvs: callable, optional
Expand All @@ -115,12 +131,18 @@ def ecdf_confidence_band(

if method == "pointwise":
prob_pointwise = prob
elif method == "optimized":
prob_pointwise = _optimize_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
elif method == "simulated":
prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
else:
raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")
raise ValueError(
f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'."
)

prob_lower, prob_upper = _get_pointwise_confidence_band(
prob_pointwise, ndraws, cdf_at_eval_points
Expand All @@ -129,6 +151,139 @@ def ecdf_confidence_band(
return prob_lower, prob_upper


def _update_ecdf_band_interior_probabilities(
prob_left: np.ndarray,
interval_left: np.ndarray,
interval_right: np.ndarray,
p: float,
ndraws: int,
) -> np.ndarray:
"""Update the probability that an ECDF has been within the envelope including at the current
point.
Arguments
---------
prob_left : np.ndarray
For each point in the interior at the previous point, the joint probability that it and all
points before are in the interior.
interval_left : np.ndarray
The set of points in the interior at the previous point.
interval_right : np.ndarray
The set of points in the interior at the current point.
p : float
The probability of any given point found between the previous point and the current one.
ndraws : int
Number of draws in the original dataset.
Returns
-------
prob_right : np.ndarray
For each point in the interior at the current point, the joint probability that it and all
previous points are in the interior.
"""
interval_left = interval_left[:, np.newaxis]
prob_conditional = binom.pmf(interval_right, ndraws - interval_left, p, loc=interval_left)
prob_right = prob_left.dot(prob_conditional)
return prob_right


@vectorize(["float64(int64, int64, float64, int64)"])
def _binom_pmf(k, n, p, loc):
k -= loc
if k < 0 or k > n:
return 0.0
if p == 0:
return 1.0 if k == 0 else 0.0
if p == 1:
return 1.0 if k == n else 0.0
if k == 0:
return (1 - p) ** n
if k == n:
return p**n
lbinom = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)
return np.exp(lbinom + k * np.log(p) + (n - k) * np.log1p(-p))


@jit(nopython=True)
def _update_ecdf_band_interior_probabilities_numba(
prob_left: np.ndarray,
interval_left: np.ndarray,
interval_right: np.ndarray,
p: float,
ndraws: int,
) -> np.ndarray:
interval_left = interval_left[:, np.newaxis]
prob_conditional = _binom_pmf(interval_right, ndraws - interval_left, p, interval_left)
prob_right = prob_left.dot(prob_conditional)
return prob_right


def _ecdf_band_interior_probability(prob_between_points, ndraws, lower_count, upper_count):
interval_left = np.arange(1)
prob_interior = np.ones(1)
for i in range(prob_between_points.shape[0]):
interval_right = np.arange(lower_count[i], upper_count[i])
prob_interior = _update_ecdf_band_interior_probabilities(
prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
)
interval_left = interval_right
return prob_interior.sum()


@jit(nopython=True)
def _ecdf_band_interior_probability_numba(prob_between_points, ndraws, lower_count, upper_count):
interval_left = np.arange(1)
prob_interior = np.ones(1)
for i in range(prob_between_points.shape[0]):
interval_right = np.arange(lower_count[i], upper_count[i])
prob_interior = _update_ecdf_band_interior_probabilities_numba(
prob_interior, interval_left, interval_right, prob_between_points[i], ndraws
)
interval_left = interval_right
return prob_interior.sum()


def _ecdf_band_optimization_objective(
prob_pointwise: float,
cdf_at_eval_points: np.ndarray,
ndraws: int,
prob_target: float,
) -> float:
"""Objective function for optimizing the simultaneous confidence band probability."""
lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points)
lower_count = (lower * ndraws).astype(int)
upper_count = (upper * ndraws).astype(int) + 1
cdf_with_zero = np.insert(cdf_at_eval_points[:-1], 0, 0)
prob_between_points = (cdf_at_eval_points - cdf_with_zero) / (1 - cdf_with_zero)
if Numba.numba_flag:
prob_interior = _ecdf_band_interior_probability_numba(
prob_between_points, ndraws, lower_count, upper_count
)
else:
prob_interior = _ecdf_band_interior_probability(
prob_between_points, ndraws, lower_count, upper_count
)
return abs(prob_interior - prob_target)


def _optimize_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray, # pylint: disable=unused-argument
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
**kwargs, # pylint: disable=unused-argument
):
"""Estimate probability for simultaneous confidence band using optimization.
This function simulates the pointwise probability needed to construct pointwise confidence bands
that form a `prob`-level confidence envelope for the ECDF of a sample.
"""
cdf_at_eval_points = np.unique(cdf_at_eval_points)
objective = lambda p: _ecdf_band_optimization_objective(p, cdf_at_eval_points, ndraws, prob)
prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x
return prob_pointwise


def _simulate_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray,
Expand Down
9 changes: 6 additions & 3 deletions arviz/tests/base_tests/test_plots_matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1285,10 +1285,11 @@ def test_plot_ecdf_eval_points():
assert axes is not None


@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "simulated"])
def test_plot_ecdf_confidence_bands(confidence_bands):
@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"])
@pytest.mark.parametrize("ndraws", [100, 10_000])
def test_plot_ecdf_confidence_bands(confidence_bands, ndraws):
"""Check that all confidence_bands values correctly accepted"""
data = np.random.randn(4, 1000)
data = np.random.randn(4, ndraws // 4)
axes = plot_ecdf(data, confidence_bands=confidence_bands, cdf=norm(0, 1).cdf)
assert axes is not None

Expand Down Expand Up @@ -1326,6 +1327,8 @@ def test_plot_ecdf_error():
# contradictory confidence band types
with pytest.raises(ValueError):
plot_ecdf(data, cdf=dist.cdf, confidence_bands="simulated", pointwise=True)
with pytest.raises(ValueError):
plot_ecdf(data, cdf=dist.cdf, confidence_bands="optimized", pointwise=True)
plot_ecdf(data, cdf=dist.cdf, confidence_bands=True, pointwise=True)
plot_ecdf(data, cdf=dist.cdf, confidence_bands="pointwise")

Expand Down
17 changes: 15 additions & 2 deletions arviz/tests/base_tests/test_stats_ecdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
_get_pointwise_confidence_band,
)

try:
import numba # pylint: disable=unused-import

numba_options = [True, False]
except ImportError:
numba_options = [False]


def test_compute_ecdf():
"""Test compute_ecdf function."""
Expand Down Expand Up @@ -109,9 +116,15 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see
ids=["continuous", "continuous default rvs", "discrete"],
)
@pytest.mark.parametrize("ndraws", [10_000])
@pytest.mark.parametrize("method", ["pointwise", "simulated"])
def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57):
@pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"])
@pytest.mark.parametrize("use_numba", numba_options)
def test_ecdf_confidence_band(
dist, rvs, prob, ndraws, method, use_numba, num_trials=1_000, seed=57
):
"""Test test_ecdf_confidence_band."""
if use_numba and method != "optimized":
pytest.skip("Numba only used in optimized method")

eval_points = np.linspace(*dist.interval(0.99), 10)
cdf_at_eval_points = dist.cdf(eval_points)
random_state = np.random.default_rng(seed)
Expand Down

0 comments on commit 5a928fb

Please sign in to comment.