Anova rm addition (#164)

qiime2 · Dec 12, 2023 · d55ee50 · d55ee50
1 parent d9c4d8d
commit d55ee50
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 29 deletions.
diff --git a/q2_longitudinal/_longitudinal.py b/q2_longitudinal/_longitudinal.py
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 
 import os.path
+
 import pkg_resources
 from distutils.dir_util import copy_tree
 
@@ -18,6 +19,8 @@
 import biom
 import statsmodels.api as sm
 from statsmodels.formula.api import ols
+from statsmodels.stats.anova import AnovaRM
+
 
 from ._utilities import (_get_group_pairs, _extract_distance_distribution,
                          _visualize, _validate_metadata_is_superset,
@@ -242,11 +245,20 @@ def linear_mixed_effects(output_dir: str, metadata: qiime2.Metadata,
 def anova(output_dir: str,
           metadata: qiime2.Metadata,
           formula: str,
-          sstype: str = 'II') -> None:
+          sstype: str = 'II',
+          repeated_measures: bool = False,
+          individual_id_column: str = None,
+          rm_aggregate: bool = False) -> None:
 
     # Grab metric and covariate names from formula
     metric, group_columns = _parse_formula(formula)
     columns = [metric] + list(group_columns)
+    # Add individual id col if performing repeated measures
+    if repeated_measures is True:
+        if individual_id_column is None or individual_id_column == "":
+            raise ValueError('individual ID column was '
+                             'not provided for repeated measures')
+        columns = columns + [individual_id_column]
 
     # Validate formula (columns are in metadata, etc)
     for col in columns:
@@ -256,31 +268,53 @@ def anova(output_dir: str,
     metadata = metadata.to_dataframe()[columns].dropna()
 
     # Run anova
-    lm = ols(formula, metadata).fit()
-    results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
-    results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')
-
-    # Run pairwise t-tests with multiple test correction
-    pairwise_tests = pd.DataFrame()
-    for group in group_columns:
-        # only run on categorical columns — numeric columns raise error
-        if group in cats:
-            ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
-            pairwise_tests = pd.concat([pairwise_tests, pd.DataFrame(ttests)])
-    if pairwise_tests.empty:
-        pairwise_tests = False
-
-    # Plot fit vs. residuals
-    metadata['residual'] = lm.resid
-    metadata['fitted_values'] = lm.fittedvalues
-    res = _regplot_subplots_from_dataframe(
-        'fitted_values', 'residual', metadata, group_columns, lowess=False,
-        ci=95, palette='Set1', fit_reg=False)
+    if repeated_measures is False:
+        lm = ols(formula, metadata).fit()
+        results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
+        results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')
+
+        # Run pairwise t-tests with multiple test correction
+        pairwise_tests = pd.DataFrame()
+        for group in group_columns:
+            # only run on categorical columns — numeric columns raise error
+            if group in cats:
+                ttests = lm.t_test_pairwise(group,
+                                            method='fdr_bh').result_frame
+                pairwise_tests = pd.concat([pairwise_tests,
+                                            pd.DataFrame(ttests)])
+        if pairwise_tests.empty:
+            pairwise_tests = False
+
+        # Plot fit vs. residuals
+        metadata['residual'] = lm.resid
+        metadata['fitted_values'] = lm.fittedvalues
+        res = _regplot_subplots_from_dataframe(
+            'fitted_values', 'residual', metadata, group_columns, lowess=False,
+            ci=95, palette='Set1', fit_reg=False)
+
+        pairwise_test_name = 'Pairwise t-tests'
+
+    elif repeated_measures is True:
+        # create mapper for aggregate function (required arg for AnovaRM)
+        agg_dict = {True: "mean", False: None}
+
+        # run anova
+        aov = AnovaRM(depvar=metric,
+                      within=list(group_columns),
+                      subject=individual_id_column,
+                      data=metadata,
+                      aggregate_func=agg_dict[rm_aggregate]).fit()
+
+        results = aov.anova_table
+        results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')
+
+        # set these for the visualize func
+        pairwise_tests, res, pairwise_test_name = False, False, False
 
     # Visualize results
     _visualize_anova(output_dir, pairwise_tests=pairwise_tests,
                      model_results=results, residuals=res,
-                     pairwise_test_name='Pairwise t-tests')
+                     pairwise_test_name=pairwise_test_name)
 
 
 def _warn_column_name_exists(column_name):

diff --git a/q2_longitudinal/_utilities.py b/q2_longitudinal/_utilities.py
@@ -546,11 +546,12 @@ def _visualize_anova(output_dir, pairwise_tests=False, model_results=False,
                          sep='\t')
     model_results = q2templates.df_to_html(model_results)
 
-    residuals.savefig(
-        os.path.join(output_dir, 'residuals.png'), bbox_inches='tight')
-    residuals.savefig(
-        os.path.join(output_dir, 'residuals.pdf'), bbox_inches='tight')
-    plt.close('all')
+    if residuals is not False:
+        residuals.savefig(
+            os.path.join(output_dir, 'residuals.png'), bbox_inches='tight')
+        residuals.savefig(
+            os.path.join(output_dir, 'residuals.pdf'), bbox_inches='tight')
+        plt.close('all')
 
     index = os.path.join(TEMPLATES, 'index.html')
     q2templates.render(index, output_dir, context={

diff --git a/q2_longitudinal/citations.bib b/q2_longitudinal/citations.bib
@@ -53,3 +53,10 @@ @article {Bokulich306167
   number={30},
   pages={934}
 }
+
+@book{rutherford2011anova,
+  title={ANOVA and ANCOVA: a GLM approach},
+  author={Rutherford, Andrew},
+  year={2011},
+  publisher={John Wiley \& Sons}
+}
diff --git a/q2_longitudinal/plugin_setup.py b/q2_longitudinal/plugin_setup.py
@@ -258,7 +258,10 @@
     inputs={},
     parameters={'metadata': Metadata,
                 'formula': Str,
-                'sstype': Str % Choices(['I', 'II', 'III'])},
+                'sstype': Str % Choices(['I', 'II', 'III']),
+                'repeated_measures': Bool,
+                'individual_id_column': Str,
+                'rm_aggregate': Bool},
     input_descriptions={},
     parameter_descriptions={
         'metadata': 'Sample metadata containing formula terms.',
@@ -267,7 +270,23 @@
                    'artifacts and can be continuous or categorical metadata '
                    'columns. ' + formula_description,
         'sstype': (
-            'Type of sum of squares calculation to perform (I, II, or III).')
+            'Type of sum of squares calculation to perform (I, II, or III).'),
+        'repeated_measures': 'Perform ANOVA as a repeated measures ANOVA. '
+                             'Implemented via statsmodels, which has the '
+                             'following limitations: Currently, only '
+                             'fully balanced within-subject designs are '
+                             'supported. Calculation of between-subject '
+                             'effects and corrections for violation of '
+                             'sphericity are not yet implemented.',
+        'individual_id_column': 'The column containing individual ID with '
+                                'repeated measures to account for.'
+                                'This should not be included in the formula.',
+        'rm_aggregate': 'If the data set contains more than a single '
+                        'observation per individual id and cell of the '
+                        'specified model, this function will be used to '
+                        'aggregate the data by the mean before running '
+                        'the ANOVA. '
+                        'Only applicable for repeated measures ANOVA. '
     },
     name='ANOVA test',
     description=('Perform an ANOVA test on any factors present in a metadata '

diff --git a/q2_longitudinal/tests/test_longitudinal.py b/q2_longitudinal/tests/test_longitudinal.py
@@ -17,6 +17,7 @@
 import pandas as pd
 import pandas.testing as pdt
 import skbio
+import statsmodels.api as sm
 import qiime2
 from qiime2.plugin.testing import TestPluginBase
 from qiime2.plugins import longitudinal
@@ -1050,6 +1051,17 @@ def setUp(self):
                             ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8',
                              's9', 's10', 's11', 's12'], name='id')))
 
+        # load longitudinal data for repeated measures
+        # This is from statsmodels example use of AnovaRM
+        self.pig_data = sm.datasets.get_rdataset("dietox", "geepack").data
+        self.pig_data["Early_Life"] = self.pig_data[
+            "Time"].apply(lambda x: 1 if x <= 6 else 0)
+        self.pig_data["Pig"] = self.pig_data["Pig"].astype(str)
+        self.pig_data.index = [str(x) for x in self.pig_data.index]
+        self.pig_data.index.name = "sampleid"
+
+        self.pig_data = qiime2.Metadata(self.pig_data)
+
     def test_execution(self):
         exp = pd.DataFrame(
             [['letter', 33.3333333333, 1.0, 27.2727272727, 0.0005474948912],
@@ -1104,6 +1116,44 @@ def test_missing_tilde(self):
             with self.assertRaisesRegex(ValueError, "missing tilde"):
                 anova(temp_dir_name, self.md, 'letter')
 
+    def test_repeated_measures_anova(self):
+        anova(self.temp_dir.name,
+              metadata=self.pig_data,
+              formula='Weight ~ Early_Life',
+              repeated_measures=True,
+              individual_id_column='Pig',
+              rm_aggregate=True)
+
+        exp = pd.DataFrame([["Early_Life", 6810.96834, 1.0,
+                             71.0, 2.869744e-72]],
+                           columns=["Unnamed: 0", "F Value", "Num DF",
+                                    "Den DF", "Pr > F"],
+                           index=[0])
+
+        obs = pd.read_csv(
+            os.path.join(self.temp_dir.name, 'anova.tsv'), sep='\t')
+
+        pdt.assert_frame_equal(obs, exp)
+
+    def test_repeated_measures_raises_error_no_id_column(self):
+        with self.assertRaisesRegex(ValueError, "individual ID column "
+                                                "was not provided for "
+                                                "repeated measures"):
+            anova(output_dir=self.temp_dir.name,
+                  metadata=self.pig_data,
+                  formula='Weight ~ Early_Life',
+                  repeated_measures=True,
+                  individual_id_column=None)
+
+        with self.assertRaisesRegex(ValueError, "individual ID column "
+                                                "was not provided for "
+                                                "repeated measures"):
+            anova(output_dir=self.temp_dir.name,
+                  metadata=self.pig_data,
+                  formula='Weight ~ Early_Life',
+                  repeated_measures=True,
+                  individual_id_column="")
+
 
 md = pd.DataFrame([(1, 'a', 0.11, 1), (1, 'a', 0.12, 2), (1, 'a', 0.13, 3),
                    (2, 'a', 0.19, 1), (2, 'a', 0.18, 2), (2, 'a', 0.21, 3),