diff --git a/CHANGELOG.md b/CHANGELOG.md index 942461eb91..da23f508dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## v0.x.x Unreleased ### New features +- Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368)) ### Maintenance and fixes diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index 9fe136c718..bc92f48e5f 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -73,6 +73,7 @@ def plot_ecdf( - False: No confidence bands are plotted (default). - True: Plot bands computed with the default algorithm (subject to change) - "pointwise": Compute the pointwise (i.e. marginal) confidence band. + - "optimized": Use optimization to estimate a simultaneous confidence band. - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band. @@ -216,8 +217,7 @@ def plot_ecdf( >>> pit_vals = distribution.cdf(sample) >>> uniform_dist = uniform(0, 1) >>> az.plot_ecdf( - >>> pit_vals, cdf=uniform_dist.cdf, - >>> rvs=uniform_dist.rvs, confidence_bands=True + >>> pit_vals, cdf=uniform_dist.cdf, confidence_bands=True, >>> ) Plot an ECDF-difference plot of PIT values. @@ -226,8 +226,8 @@ def plot_ecdf( :context: close-figs >>> az.plot_ecdf( - >>> pit_vals, cdf = uniform_dist.cdf, rvs = uniform_dist.rvs, - >>> confidence_bands = True, difference = True + >>> pit_vals, cdf = uniform_dist.cdf, confidence_bands = True, + >>> difference = True >>> ) """ if confidence_bands is True: @@ -238,9 +238,12 @@ def plot_ecdf( ) confidence_bands = "pointwise" else: - confidence_bands = "simulated" - elif confidence_bands == "simulated" and pointwise: - raise ValueError("Cannot specify both `confidence_bands='simulated'` and `pointwise=True`") + confidence_bands = "auto" + # if pointwise specified, confidence_bands must be a bool or 'pointwise' + elif confidence_bands not in [False, "pointwise"] and pointwise: + raise ValueError( + f"Cannot specify both `confidence_bands='{confidence_bands}'` and `pointwise=True`" + ) if fpr is not None: warnings.warn( @@ -298,7 +301,7 @@ def plot_ecdf( "`eval_points` explicitly.", BehaviourChangeWarning, ) - if confidence_bands == "simulated": + if confidence_bands in ["optimized", "simulated"]: warnings.warn( "For simultaneous bands to be correctly calibrated, specify `eval_points` " "independent of the `values`" @@ -319,6 +322,11 @@ def plot_ecdf( if confidence_bands: ndraws = len(values) + if confidence_bands == "auto": + if ndraws < 200 or num_trials >= 250 * np.sqrt(ndraws): + confidence_bands = "optimized" + else: + confidence_bands = "simulated" x_bands = eval_points lower, higher = ecdf_confidence_band( ndraws, diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py index 66658d659c..1985446c68 100644 --- a/arviz/stats/ecdf_utils.py +++ b/arviz/stats/ecdf_utils.py @@ -1,10 +1,25 @@ """Functions for evaluating ECDFs and their confidence bands.""" +import math from typing import Any, Callable, Optional, Tuple import warnings import numpy as np from scipy.stats import uniform, binom +from scipy.optimize import minimize_scalar + +try: + from numba import jit, vectorize +except ImportError: + + def jit(*args, **kwargs): # pylint: disable=unused-argument + return lambda f: f + + def vectorize(*args, **kwargs): # pylint: disable=unused-argument + return lambda f: f + + +from ..utils import Numba def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray: @@ -73,7 +88,7 @@ def ecdf_confidence_band( eval_points: np.ndarray, cdf_at_eval_points: np.ndarray, prob: float = 0.95, - method="simulated", + method="optimized", **kwargs, ) -> Tuple[np.ndarray, np.ndarray]: """Compute the `prob`-level confidence band for the ECDF. @@ -92,6 +107,7 @@ def ecdf_confidence_band( method : string, default "simulated" The method used to compute the confidence band. Valid options are: - "pointwise": Compute the pointwise (i.e. marginal) confidence band. + - "optimized": Use optimization to estimate a simultaneous confidence band. - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band. `rvs` must be provided. rvs: callable, optional @@ -115,12 +131,18 @@ def ecdf_confidence_band( if method == "pointwise": prob_pointwise = prob + elif method == "optimized": + prob_pointwise = _optimize_simultaneous_ecdf_band_probability( + ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs + ) elif method == "simulated": prob_pointwise = _simulate_simultaneous_ecdf_band_probability( ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs ) else: - raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.") + raise ValueError( + f"Unknown method {method}. Valid options are 'pointwise', 'optimized', or 'simulated'." + ) prob_lower, prob_upper = _get_pointwise_confidence_band( prob_pointwise, ndraws, cdf_at_eval_points @@ -129,6 +151,139 @@ def ecdf_confidence_band( return prob_lower, prob_upper +def _update_ecdf_band_interior_probabilities( + prob_left: np.ndarray, + interval_left: np.ndarray, + interval_right: np.ndarray, + p: float, + ndraws: int, +) -> np.ndarray: + """Update the probability that an ECDF has been within the envelope including at the current + point. + + Arguments + --------- + prob_left : np.ndarray + For each point in the interior at the previous point, the joint probability that it and all + points before are in the interior. + interval_left : np.ndarray + The set of points in the interior at the previous point. + interval_right : np.ndarray + The set of points in the interior at the current point. + p : float + The probability of any given point found between the previous point and the current one. + ndraws : int + Number of draws in the original dataset. + + Returns + ------- + prob_right : np.ndarray + For each point in the interior at the current point, the joint probability that it and all + previous points are in the interior. + """ + interval_left = interval_left[:, np.newaxis] + prob_conditional = binom.pmf(interval_right, ndraws - interval_left, p, loc=interval_left) + prob_right = prob_left.dot(prob_conditional) + return prob_right + + +@vectorize(["float64(int64, int64, float64, int64)"]) +def _binom_pmf(k, n, p, loc): + k -= loc + if k < 0 or k > n: + return 0.0 + if p == 0: + return 1.0 if k == 0 else 0.0 + if p == 1: + return 1.0 if k == n else 0.0 + if k == 0: + return (1 - p) ** n + if k == n: + return p**n + lbinom = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1) + return np.exp(lbinom + k * np.log(p) + (n - k) * np.log1p(-p)) + + +@jit(nopython=True) +def _update_ecdf_band_interior_probabilities_numba( + prob_left: np.ndarray, + interval_left: np.ndarray, + interval_right: np.ndarray, + p: float, + ndraws: int, +) -> np.ndarray: + interval_left = interval_left[:, np.newaxis] + prob_conditional = _binom_pmf(interval_right, ndraws - interval_left, p, interval_left) + prob_right = prob_left.dot(prob_conditional) + return prob_right + + +def _ecdf_band_interior_probability(prob_between_points, ndraws, lower_count, upper_count): + interval_left = np.arange(1) + prob_interior = np.ones(1) + for i in range(prob_between_points.shape[0]): + interval_right = np.arange(lower_count[i], upper_count[i]) + prob_interior = _update_ecdf_band_interior_probabilities( + prob_interior, interval_left, interval_right, prob_between_points[i], ndraws + ) + interval_left = interval_right + return prob_interior.sum() + + +@jit(nopython=True) +def _ecdf_band_interior_probability_numba(prob_between_points, ndraws, lower_count, upper_count): + interval_left = np.arange(1) + prob_interior = np.ones(1) + for i in range(prob_between_points.shape[0]): + interval_right = np.arange(lower_count[i], upper_count[i]) + prob_interior = _update_ecdf_band_interior_probabilities_numba( + prob_interior, interval_left, interval_right, prob_between_points[i], ndraws + ) + interval_left = interval_right + return prob_interior.sum() + + +def _ecdf_band_optimization_objective( + prob_pointwise: float, + cdf_at_eval_points: np.ndarray, + ndraws: int, + prob_target: float, +) -> float: + """Objective function for optimizing the simultaneous confidence band probability.""" + lower, upper = _get_pointwise_confidence_band(prob_pointwise, ndraws, cdf_at_eval_points) + lower_count = (lower * ndraws).astype(int) + upper_count = (upper * ndraws).astype(int) + 1 + cdf_with_zero = np.insert(cdf_at_eval_points[:-1], 0, 0) + prob_between_points = (cdf_at_eval_points - cdf_with_zero) / (1 - cdf_with_zero) + if Numba.numba_flag: + prob_interior = _ecdf_band_interior_probability_numba( + prob_between_points, ndraws, lower_count, upper_count + ) + else: + prob_interior = _ecdf_band_interior_probability( + prob_between_points, ndraws, lower_count, upper_count + ) + return abs(prob_interior - prob_target) + + +def _optimize_simultaneous_ecdf_band_probability( + ndraws: int, + eval_points: np.ndarray, # pylint: disable=unused-argument + cdf_at_eval_points: np.ndarray, + prob: float = 0.95, + **kwargs, # pylint: disable=unused-argument +): + """Estimate probability for simultaneous confidence band using optimization. + + This function simulates the pointwise probability needed to construct pointwise confidence bands + that form a `prob`-level confidence envelope for the ECDF of a sample. + """ + cdf_at_eval_points = np.unique(cdf_at_eval_points) + objective = lambda p: _ecdf_band_optimization_objective(p, cdf_at_eval_points, ndraws, prob) + prob_pointwise = minimize_scalar(objective, bounds=(prob, 1), method="bounded").x + return prob_pointwise + + def _simulate_simultaneous_ecdf_band_probability( ndraws: int, eval_points: np.ndarray, diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index 1183812db4..5afcbe1669 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -1285,10 +1285,11 @@ def test_plot_ecdf_eval_points(): assert axes is not None -@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "simulated"]) -def test_plot_ecdf_confidence_bands(confidence_bands): +@pytest.mark.parametrize("confidence_bands", [True, "pointwise", "optimized", "simulated"]) +@pytest.mark.parametrize("ndraws", [100, 10_000]) +def test_plot_ecdf_confidence_bands(confidence_bands, ndraws): """Check that all confidence_bands values correctly accepted""" - data = np.random.randn(4, 1000) + data = np.random.randn(4, ndraws // 4) axes = plot_ecdf(data, confidence_bands=confidence_bands, cdf=norm(0, 1).cdf) assert axes is not None @@ -1326,6 +1327,8 @@ def test_plot_ecdf_error(): # contradictory confidence band types with pytest.raises(ValueError): plot_ecdf(data, cdf=dist.cdf, confidence_bands="simulated", pointwise=True) + with pytest.raises(ValueError): + plot_ecdf(data, cdf=dist.cdf, confidence_bands="optimized", pointwise=True) plot_ecdf(data, cdf=dist.cdf, confidence_bands=True, pointwise=True) plot_ecdf(data, cdf=dist.cdf, confidence_bands="pointwise") diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py index b6d5563464..68d1e98189 100644 --- a/arviz/tests/base_tests/test_stats_ecdf_utils.py +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -10,6 +10,13 @@ _get_pointwise_confidence_band, ) +try: + import numba # pylint: disable=unused-import + + numba_options = [True, False] +except ImportError: + numba_options = [False] + def test_compute_ecdf(): """Test compute_ecdf function.""" @@ -109,9 +116,15 @@ def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, see ids=["continuous", "continuous default rvs", "discrete"], ) @pytest.mark.parametrize("ndraws", [10_000]) -@pytest.mark.parametrize("method", ["pointwise", "simulated"]) -def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57): +@pytest.mark.parametrize("method", ["pointwise", "optimized", "simulated"]) +@pytest.mark.parametrize("use_numba", numba_options) +def test_ecdf_confidence_band( + dist, rvs, prob, ndraws, method, use_numba, num_trials=1_000, seed=57 +): """Test test_ecdf_confidence_band.""" + if use_numba and method != "optimized": + pytest.skip("Numba only used in optimized method") + eval_points = np.linspace(*dist.interval(0.99), 10) cdf_at_eval_points = dist.cdf(eval_points) random_state = np.random.default_rng(seed)