Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Pandas Groupby Functionality to Stats Functions #86

Merged
merged 13 commits into from
Sep 8, 2023
10 changes: 6 additions & 4 deletions thicket/stats/calc_boxplot_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], **
col + "_outliers" + q_list: [],
}

for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
values = thicket.dataframe.loc[node][col].tolist()
df = thicket.dataframe.reset_index().groupby("node")
for node, item in df:
values = df.get_group(node)[col].tolist()

q = np.quantile(values, quartiles)
q1 = q[0]
Expand Down Expand Up @@ -107,8 +108,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], **
}
}

for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
values = thicket.dataframe.loc[node][(idx, col)].tolist()
df = thicket.dataframe.reset_index().groupby("node")
for node, item in df:
values = df.get_group(node)[(idx, col)].tolist()

q = np.quantile(values, quartiles)
q1 = q[0]
Expand Down
62 changes: 40 additions & 22 deletions thicket/stats/check_normality.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,46 +35,64 @@ def check_normality(thicket, columns=None):

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = (
thicket.dataframe.select_dtypes(include="number")
.reset_index()
.groupby("node")
.agg(stats.shapiro)
)
for column in columns:
normality = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
pvalue = stats.shapiro(thicket.dataframe.loc[node][column])[1]
for i in range(0, len(df[column])):
pvalue = df[column][i].pvalue

if pvalue < 0.05:
normality.append("False")
thicket.statsframe.dataframe.loc[
df.index[i], column + "_normality"
] = "False"
elif pvalue > 0.05:
normality.append("True")
thicket.statsframe.dataframe.loc[
df.index[i], column + "_normality"
] = "True"
else:
normality.append(pd.NA)
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_normality")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_normality")

thicket.statsframe.dataframe[column + "_normality"] = normality
thicket.statsframe.dataframe.loc[
df.index[i], column + "_normality"
] = pd.NA
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_normality")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_normality")
# columnar joined thicket object
else:
df = (
thicket.dataframe.select_dtypes(include="number")
.reset_index(level=1)
.groupby("node")
.agg(stats.shapiro)
)
for idx, column in columns:
normality = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
pvalue = stats.shapiro(thicket.dataframe.loc[node][(idx, column)])[1]
for i in range(0, len(df[(idx, column)])):
pvalue = df[(idx, column)][i].pvalue

if pvalue < 0.05:
normality.append("False")
thicket.statsframe.dataframe.loc[
df.index[i], (idx, column + "_normality")
] = "False"
elif pvalue > 0.05:
normality.append("True")
thicket.statsframe.dataframe.loc[
df.index[i], (idx, column + "_normality")
] = "True"
else:
normality.append(pd.NA)
thicket.statsframe.dataframe.loc[
df.index[i], (idx, column + "_normality")
] = pd.NA
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_normality"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_normality"))

thicket.statsframe.dataframe[(idx, column + "_normality")] = normality

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
30 changes: 16 additions & 14 deletions thicket/stats/correlation_nodewise.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears
)
# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = thicket.dataframe.reset_index().groupby("node")
correlated = []
for node in thicket.statsframe.dataframe.index.tolist():
for node, item in df:
if correlation == "pearson":
correlated.append(
stats.pearsonr(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
elif correlation == "spearman":
correlated.append(
stats.spearmanr(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
elif correlation == "kendall":
correlated.append(
stats.kendalltau(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
else:
Expand All @@ -70,27 +71,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears
] = correlated
# columnar joined thicket object
else:
df = thicket.dataframe.reset_index().groupby("node")
correlated = []
for node in thicket.statsframe.dataframe.index.tolist():
for node, item in df:
if correlation == "pearson":
correlated.append(
stats.pearsonr(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
elif correlation == "spearman":
correlated.append(
stats.spearmanr(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
elif correlation == "kendall":
correlated.append(
stats.kendalltau(
thicket.dataframe.loc[node][column1],
thicket.dataframe.loc[node][column2],
df.get_group(node)[column1],
df.get_group(node)[column2],
)[0]
)
else:
Expand Down
15 changes: 4 additions & 11 deletions thicket/stats/maximum.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
#
# SPDX-License-Identifier: MIT

import pandas as pd

from ..utils import verify_thicket_structures


Expand All @@ -31,32 +29,27 @@ def maximum(thicket, columns=None):

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = thicket.dataframe[columns].reset_index().groupby("node").agg(max)
for column in columns:
maximum = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
maximum.append(max(thicket.dataframe.loc[node][column]))
thicket.statsframe.dataframe[column + "_max"] = df[column]
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_max")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_max")

thicket.statsframe.dataframe[column + "_max"] = maximum
# columnar joined thicket object
else:
df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(max)
for idx, column in columns:
maximum = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
maximum.append(max(thicket.dataframe.loc[node][(idx, column)]))
thicket.statsframe.dataframe[(idx, column + "_max")] = df[(idx, column)]
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_max"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_max"))

thicket.statsframe.dataframe[(idx, column + "_max")] = maximum

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
17 changes: 6 additions & 11 deletions thicket/stats/mean.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd

from ..utils import verify_thicket_structures

Expand All @@ -30,32 +29,28 @@ def mean(thicket, columns=None):

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.mean)
for column in columns:
mean = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
mean.append(np.mean(thicket.dataframe.loc[node][column]))
thicket.statsframe.dataframe[column + "_mean"] = df[column]
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_mean")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_mean")

thicket.statsframe.dataframe[column + "_mean"] = mean
# columnar joined thicket object
else:
df = (
thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.mean)
)
for idx, column in columns:
mean = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
mean.append(np.mean(thicket.dataframe.loc[node][(idx, column)]))
thicket.statsframe.dataframe[(idx, column + "_mean")] = df[(idx, column)]
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_mean"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_mean"))

thicket.statsframe.dataframe[(idx, column + "_mean")] = mean

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
19 changes: 9 additions & 10 deletions thicket/stats/median.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd

from ..utils import verify_thicket_structures

Expand All @@ -30,32 +29,32 @@ def median(thicket, columns=None):

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.median)
for column in columns:
median = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
median.append(np.median(thicket.dataframe.loc[node][column]))
thicket.statsframe.dataframe[column + "_median"] = df[column]
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_median")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_median")

thicket.statsframe.dataframe[column + "_median"] = median
# columnar joined thicket object
else:
df = (
thicket.dataframe[columns]
.reset_index(level=1)
.groupby("node")
.agg(np.median)
)
for idx, column in columns:
median = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
median.append(np.median(thicket.dataframe.loc[node][(idx, column)]))
thicket.statsframe.dataframe[(idx, column + "_median")] = df[(idx, column)]
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_median"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_median"))

thicket.statsframe.dataframe[(idx, column + "_median")] = median

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
16 changes: 4 additions & 12 deletions thicket/stats/minimum.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
#
# SPDX-License-Identifier: MIT

import pandas as pd

from ..utils import verify_thicket_structures


Expand All @@ -31,32 +29,26 @@ def minimum(thicket, columns=None):

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
df = thicket.dataframe[columns].reset_index().groupby("node").agg(min)
for column in columns:
minimum = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
minimum.append(min(thicket.dataframe.loc[node][column]))
thicket.statsframe.dataframe[column + "_min"] = df[column]
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_min")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_min")

thicket.statsframe.dataframe[column + "_min"] = minimum
# columnar joined thicket object
else:
df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(min)
for idx, column in columns:
minimum = []
for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()):
minimum.append(min(thicket.dataframe.loc[node][(idx, column)]))
thicket.statsframe.dataframe[(idx, column + "_min")] = df[(idx, column)]
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_min"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_min"))

thicket.statsframe.dataframe[(idx, column + "_min")] = minimum

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
Loading
Loading