raphaelvallat · ytzfhqs · Aug 14, 2023 · Sep 6, 2024 · qbarthelemy · Sep 17, 2024
diff --git a/pingouin/contingency.py b/pingouin/contingency.py
@@ -5,11 +5,11 @@
 
 from scipy.stats.contingency import expected_freq
 from scipy.stats import power_divergence, binom, chi2 as sp_chi2
-
+from scipy.stats import barnard_exact, boschloo_exact, fisher_exact
 from pingouin import power_chi2, _postprocess_dataframe
 
 
-__all__ = ["chi2_independence", "chi2_mcnemar", "dichotomous_crosstab"]
+__all__ = ["chi2_independence", "chi2_mcnemar", "chi2_exact","dichotomous_crosstab"]
 
 
 ###############################################################################
@@ -345,6 +345,168 @@ def chi2_mcnemar(data, x, y, correction=True):
     return observed, _postprocess_dataframe(stats)
 
 
+def chi2_exact(data, x, y, method="fisher", **kwargs):
+    """
+    Perform an exact test on a 2x2 contingency table.
+
+    Parameters
+    ----------
+    data : :py:class:`pandas.DataFrame`
+        The dataframe containing the occurrences for the test.
+    x, y : string
+        The variables names for the exact test. Must be names of columns
+        in ``data``. Each variable must take exactly two distinct values so
+        that the resulting contingency table is 2x2.
+    method : {"fisher", "barnard", "boschloo"}, default="fisher"
+        Method of exact test.
+    **kwargs
+        Additional keyword arguments forwarded to the underlying SciPy function
+        (:py:func:`scipy.stats.fisher_exact`, :py:func:`scipy.stats.barnard_exact`
+        or :py:func:`scipy.stats.boschloo_exact`). ``alternative`` must not be
+        passed here because the three alternatives are always computed.
+
+    Returns
+    -------
+    expected : :py:class:`pandas.DataFrame`
+        The expected contingency table of frequencies.
+    observed : :py:class:`pandas.DataFrame`
+        The observed contingency table of frequencies (raw counts, no
+        continuity correction is applied).
+    stats : :py:class:`pandas.DataFrame`
+        The test summary, with one row per alternative hypothesis and three
+        columns:
+
+        * ``'alternative'``: Defines the alternative hypothesis
+        * ``'odds ratio'``: The test statistic returned by SciPy. For
+          ``method="fisher"`` this is the prior odds ratio. For
+          ``method="barnard"`` it is the Wald statistic and for
+          ``method="boschloo"`` it is the p-value of Fisher's exact test, i.e.
+          it is *not* an odds ratio for these two methods.
+        * ``'pval'``: The p-value of the test
+
+    Raises
+    ------
+    ValueError
+        If ``method`` is not supported, if the contingency table is empty or
+        if it is not 2x2.
+
+    Notes
+    -----
+    *Fisher's exact test*: the null hypothesis is that the true odds ratio of
+    the populations underlying the observations is one, and the observations
+    were sampled from these populations under a condition: the marginals of the
+    resulting table must equal those of the observed table. The statistic
+    returned is the unconditional maximum likelihood estimate of the odds ratio,
+    and the p-value is the probability under the null hypothesis of obtaining a
+    table at least as extreme as the one that was actually observed. There are
+    other possible choices of statistic and two-sided p-value definition
+    associated with Fisher's exact test.
+
+    *Barnard's test* is an exact test used in the analysis of contingency
+    tables. It examines the association of two categorical variables, and is a
+    more powerful alternative than Fisher's exact test for 2x2 contingency
+    tables.
+
+    *Boschloo's test* is an exact test used in the analysis of contingency
+    tables. It examines the association of two categorical variables, and is a
+    uniformly more powerful alternative to Fisher's exact test for 2x2
+    contingency tables.
+
+    References
+    ----------
+    * Fisher, Sir Ronald A, "The Design of Experiments: Mathematics of
+      a Lady Tasting Tea." ISBN 978-0-486-41151-4, 1935.
+
+    * "Fisher's exact test", https://en.wikipedia.org/wiki/Fisher%27s_exact_test
+
+    * Emma V. Low et al. "Identifying the lowest effective dose of acetazolamide
+      for the prophylaxis of acute mountain sickness: systematic review and
+      meta-analysis." BMJ, 345, DOI:10.1136/bmj.e6779, 2012.
+
+    * Barnard, G. A. "Significance Tests for 2x2 Tables". Biometrika. 34.1/2 (1947):
+      123-138. DOI:dpgkg3
+
+    * Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus unconditional
+      exact tests for comparing two binomials." Cytel Software Corporation 675
+      (2003): 1-5.
+
+    * "Wald Test". Wikipedia. https://en.wikipedia.org/wiki/Wald_test
+
+    * R.D. Boschloo. "Raised conditional level of significance for the 2 x 2-table
+      when testing the equality of two probabilities", Statistica Neerlandica,
+      24(1), 1970
+
+    * "Boschloo's test", Wikipedia, https://en.wikipedia.org/wiki/Boschloo%27s_test
+
+    * Lise M. Saari et al. "Employee attitudes and job satisfaction", Human Resource
+      Management, 43(4), 395-407, 2004, DOI:10.1002/hrm.20032.
+
+    Examples
+    --------
+    Let's see if gender is a good categorical predictor for the presence of
+    heart disease.
+
+    >>> import pingouin as pg
+    >>> data = pg.read_dataset('chi2_independence')
+    >>> data['sex'].value_counts(ascending=True)
+    0     96
+    1    207
+    Name: sex, dtype: int64
+
+    If gender is not a good predictor for heart disease, we should expect the
+    same 96:207 ratio across the target classes.
+
+    >>> expected, observed, stats = pg.chi2_exact(data, x='sex', y='target', method='fisher')
+    >>> expected
+    target          0           1
+    sex
+    0       43.722772   52.277228
+    1       94.277228  112.722772
+
+    Let's see what the data tells us.
+
+    >>> observed
+    target    0   1
+    sex
+    0        24  72
+    1       114  93
+
+    The observed counts clearly depart from the expected ones. The tests
+    should be sensitive to this difference.
+
+    >>> stats.round(3)
+       alternative  odds ratio  pval
+    0    two-sided       0.272   0.0
+    1         less       0.272   0.0
+    2      greater       0.272   1.0
+
+    The p-value is very small for the two-sided and "less" alternatives. This
+    suggests that there is a significant relationship between gender and
+    heart disease: an odds ratio below one means that the odds of
+    ``target == 0`` are lower when ``sex == 0`` than when ``sex == 1``.
+    """
+    assert isinstance(data, pd.DataFrame), "data must be a pandas DataFrame."
+    assert isinstance(x, (str, int)), "x must be a string or int."
+    assert isinstance(y, (str, int)), "y must be a string or int."
+    assert all(col in data.columns for col in (x, y)), "columns are not in dataframe."
+    assert isinstance(method, str), (
+        "method must be a string and must be one of 'fisher', 'barnard', 'boschloo'."
+    )
+
+    # Resolve the SciPy implementation first so that an unsupported method fails
+    # before any computation is done on the data.
+    exact_tests = {
+        "fisher": fisher_exact,
+        "barnard": barnard_exact,
+        "boschloo": boschloo_exact,
+    }
+    if method not in exact_tests:
+        raise ValueError(f"Unsupported method {method}")
+    func = exact_tests[method]
+
+    observed = pd.crosstab(data[x], data[y])
+
+    if observed.size == 0:
+        raise ValueError("No data; observed has size 0.")
+
+    # The three SciPy exact tests only accept 2x2 tables. Checking here gives an
+    # error message that names the offending columns.
+    if observed.shape != (2, 2):
+        raise ValueError(
+            f"Exact tests require a 2x2 contingency table, but crossing '{x}' and '{y}' "
+            f"gives a table of shape {observed.shape}."
+        )
+
+    expected = pd.DataFrame(expected_freq(observed), index=observed.index, columns=observed.columns)
+
+    # One row per alternative hypothesis. The "odds ratio" column holds whatever
+    # statistic the selected SciPy test reports (see the Returns section).
+    rows = []
+    for alternative in ("two-sided", "less", "greater"):
+        res = func(table=observed, alternative=alternative, **kwargs)
+        rows.append(
+            {
+                "alternative": alternative,
+                "odds ratio": res.statistic,
+                "pval": res.pvalue,
+            }
+        )
+
+    stats = pd.DataFrame(rows, columns=["alternative", "odds ratio", "pval"])
+
+    return expected, observed, _postprocess_dataframe(stats)
+
+
 ###############################################################################
 # DICHOTOMOUS CONTINGENCY TABLES
 ###############################################################################