Skip to content

Commit

Permalink
Anova rm addition (#164)
Browse files Browse the repository at this point in the history
  • Loading branch information
sterrettJD authored Dec 12, 2023
1 parent d9c4d8d commit d55ee50
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 29 deletions.
78 changes: 56 additions & 22 deletions q2_longitudinal/_longitudinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------

import os.path

import pkg_resources
from distutils.dir_util import copy_tree

Expand All @@ -18,6 +19,8 @@
import biom
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM


from ._utilities import (_get_group_pairs, _extract_distance_distribution,
_visualize, _validate_metadata_is_superset,
Expand Down Expand Up @@ -242,11 +245,20 @@ def linear_mixed_effects(output_dir: str, metadata: qiime2.Metadata,
def anova(output_dir: str,
metadata: qiime2.Metadata,
formula: str,
sstype: str = 'II') -> None:
sstype: str = 'II',
repeated_measures: bool = False,
individual_id_column: str = None,
rm_aggregate: bool = False) -> None:

# Grab metric and covariate names from formula
metric, group_columns = _parse_formula(formula)
columns = [metric] + list(group_columns)
# Add individual id col if performing repeated measures
if repeated_measures is True:
if individual_id_column is None or individual_id_column == "":
raise ValueError('individual ID column was '
'not provided for repeated measures')
columns = columns + [individual_id_column]

# Validate formula (columns are in metadata, etc)
for col in columns:
Expand All @@ -256,31 +268,53 @@ def anova(output_dir: str,
metadata = metadata.to_dataframe()[columns].dropna()

# Run anova
lm = ols(formula, metadata).fit()
results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

# Run pairwise t-tests with multiple test correction
pairwise_tests = pd.DataFrame()
for group in group_columns:
# only run on categorical columns — numeric columns raise error
if group in cats:
ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
pairwise_tests = pd.concat([pairwise_tests, pd.DataFrame(ttests)])
if pairwise_tests.empty:
pairwise_tests = False

# Plot fit vs. residuals
metadata['residual'] = lm.resid
metadata['fitted_values'] = lm.fittedvalues
res = _regplot_subplots_from_dataframe(
'fitted_values', 'residual', metadata, group_columns, lowess=False,
ci=95, palette='Set1', fit_reg=False)
if repeated_measures is False:
lm = ols(formula, metadata).fit()
results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

# Run pairwise t-tests with multiple test correction
pairwise_tests = pd.DataFrame()
for group in group_columns:
# only run on categorical columns — numeric columns raise error
if group in cats:
ttests = lm.t_test_pairwise(group,
method='fdr_bh').result_frame
pairwise_tests = pd.concat([pairwise_tests,
pd.DataFrame(ttests)])
if pairwise_tests.empty:
pairwise_tests = False

# Plot fit vs. residuals
metadata['residual'] = lm.resid
metadata['fitted_values'] = lm.fittedvalues
res = _regplot_subplots_from_dataframe(
'fitted_values', 'residual', metadata, group_columns, lowess=False,
ci=95, palette='Set1', fit_reg=False)

pairwise_test_name = 'Pairwise t-tests'

elif repeated_measures is True:
# create mapper for aggregate function (required arg for AnovaRM)
agg_dict = {True: "mean", False: None}

# run anova
aov = AnovaRM(depvar=metric,
within=list(group_columns),
subject=individual_id_column,
data=metadata,
aggregate_func=agg_dict[rm_aggregate]).fit()

results = aov.anova_table
results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

# set these for the visualize func
pairwise_tests, res, pairwise_test_name = False, False, False

# Visualize results
_visualize_anova(output_dir, pairwise_tests=pairwise_tests,
model_results=results, residuals=res,
pairwise_test_name='Pairwise t-tests')
pairwise_test_name=pairwise_test_name)


def _warn_column_name_exists(column_name):
Expand Down
11 changes: 6 additions & 5 deletions q2_longitudinal/_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,11 +546,12 @@ def _visualize_anova(output_dir, pairwise_tests=False, model_results=False,
sep='\t')
model_results = q2templates.df_to_html(model_results)

residuals.savefig(
os.path.join(output_dir, 'residuals.png'), bbox_inches='tight')
residuals.savefig(
os.path.join(output_dir, 'residuals.pdf'), bbox_inches='tight')
plt.close('all')
if residuals is not False:
residuals.savefig(
os.path.join(output_dir, 'residuals.png'), bbox_inches='tight')
residuals.savefig(
os.path.join(output_dir, 'residuals.pdf'), bbox_inches='tight')
plt.close('all')

index = os.path.join(TEMPLATES, 'index.html')
q2templates.render(index, output_dir, context={
Expand Down
7 changes: 7 additions & 0 deletions q2_longitudinal/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ @article {Bokulich306167
number={30},
pages={934}
}

@book{rutherford2011anova,
title={ANOVA and ANCOVA: a GLM approach},
author={Rutherford, Andrew},
year={2011},
publisher={John Wiley \& Sons}
}
23 changes: 21 additions & 2 deletions q2_longitudinal/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,10 @@
inputs={},
parameters={'metadata': Metadata,
'formula': Str,
'sstype': Str % Choices(['I', 'II', 'III'])},
'sstype': Str % Choices(['I', 'II', 'III']),
'repeated_measures': Bool,
'individual_id_column': Str,
'rm_aggregate': Bool},
input_descriptions={},
parameter_descriptions={
'metadata': 'Sample metadata containing formula terms.',
Expand All @@ -267,7 +270,23 @@
'artifacts and can be continuous or categorical metadata '
'columns. ' + formula_description,
'sstype': (
'Type of sum of squares calculation to perform (I, II, or III).')
'Type of sum of squares calculation to perform (I, II, or III).'),
'repeated_measures': 'Perform ANOVA as a repeated measures ANOVA. '
'Implemented via statsmodels, which has the '
'following limitations: Currently, only '
'fully balanced within-subject designs are '
'supported. Calculation of between-subject '
'effects and corrections for violation of '
'sphericity are not yet implemented.',
'individual_id_column': 'The column containing individual ID with '
'repeated measures to account for.'
'This should not be included in the formula.',
'rm_aggregate': 'If the data set contains more than a single '
'observation per individual id and cell of the '
'specified model, this function will be used to '
'aggregate the data by the mean before running '
'the ANOVA. '
'Only applicable for repeated measures ANOVA. '
},
name='ANOVA test',
description=('Perform an ANOVA test on any factors present in a metadata '
Expand Down
50 changes: 50 additions & 0 deletions q2_longitudinal/tests/test_longitudinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pandas as pd
import pandas.testing as pdt
import skbio
import statsmodels.api as sm
import qiime2
from qiime2.plugin.testing import TestPluginBase
from qiime2.plugins import longitudinal
Expand Down Expand Up @@ -1050,6 +1051,17 @@ def setUp(self):
['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8',
's9', 's10', 's11', 's12'], name='id')))

# load longitudinal data for repeated measures
# This is from statsmodels example use of AnovaRM
self.pig_data = sm.datasets.get_rdataset("dietox", "geepack").data
self.pig_data["Early_Life"] = self.pig_data[
"Time"].apply(lambda x: 1 if x <= 6 else 0)
self.pig_data["Pig"] = self.pig_data["Pig"].astype(str)
self.pig_data.index = [str(x) for x in self.pig_data.index]
self.pig_data.index.name = "sampleid"

self.pig_data = qiime2.Metadata(self.pig_data)

def test_execution(self):
exp = pd.DataFrame(
[['letter', 33.3333333333, 1.0, 27.2727272727, 0.0005474948912],
Expand Down Expand Up @@ -1104,6 +1116,44 @@ def test_missing_tilde(self):
with self.assertRaisesRegex(ValueError, "missing tilde"):
anova(temp_dir_name, self.md, 'letter')

def test_repeated_measures_anova(self):
anova(self.temp_dir.name,
metadata=self.pig_data,
formula='Weight ~ Early_Life',
repeated_measures=True,
individual_id_column='Pig',
rm_aggregate=True)

exp = pd.DataFrame([["Early_Life", 6810.96834, 1.0,
71.0, 2.869744e-72]],
columns=["Unnamed: 0", "F Value", "Num DF",
"Den DF", "Pr > F"],
index=[0])

obs = pd.read_csv(
os.path.join(self.temp_dir.name, 'anova.tsv'), sep='\t')

pdt.assert_frame_equal(obs, exp)

def test_repeated_measures_raises_error_no_id_column(self):
with self.assertRaisesRegex(ValueError, "individual ID column "
"was not provided for "
"repeated measures"):
anova(output_dir=self.temp_dir.name,
metadata=self.pig_data,
formula='Weight ~ Early_Life',
repeated_measures=True,
individual_id_column=None)

with self.assertRaisesRegex(ValueError, "individual ID column "
"was not provided for "
"repeated measures"):
anova(output_dir=self.temp_dir.name,
metadata=self.pig_data,
formula='Weight ~ Early_Life',
repeated_measures=True,
individual_id_column="")


md = pd.DataFrame([(1, 'a', 0.11, 1), (1, 'a', 0.12, 2), (1, 'a', 0.13, 3),
(2, 'a', 0.19, 1), (2, 'a', 0.18, 2), (2, 'a', 0.21, 3),
Expand Down

0 comments on commit d55ee50

Please sign in to comment.