diff --git a/thicket/stats/calc_boxplot_statistics.py b/thicket/stats/calc_boxplot_statistics.py index d26eeb57..51055387 100644 --- a/thicket/stats/calc_boxplot_statistics.py +++ b/thicket/stats/calc_boxplot_statistics.py @@ -51,8 +51,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], ** col + "_outliers" + q_list: [], } - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - values = thicket.dataframe.loc[node][col].tolist() + df = thicket.dataframe.reset_index().groupby("node") + for node, item in df: + values = df.get_group(node)[col].tolist() q = np.quantile(values, quartiles) q1 = q[0] @@ -107,8 +108,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], ** } } - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - values = thicket.dataframe.loc[node][(idx, col)].tolist() + df = thicket.dataframe.reset_index().groupby("node") + for node, item in df: + values = df.get_group(node)[(idx, col)].tolist() q = np.quantile(values, quartiles) q1 = q[0] diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index 905e637b..e8d55633 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -35,38 +35,58 @@ def check_normality(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = ( + thicket.dataframe.select_dtypes(include="number") + .reset_index() + .groupby("node") + .agg(stats.shapiro) + ) for column in columns: - normality = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - pvalue = stats.shapiro(thicket.dataframe.loc[node][column])[1] + for i in range(0, len(df[column])): + pvalue = df[column][i].pvalue if pvalue < 0.05: - normality.append("False") + thicket.statsframe.dataframe.loc[ + df.index[i], column + "_normality" + ] = "False" elif pvalue > 0.05: - normality.append("True") + thicket.statsframe.dataframe.loc[ + 
df.index[i], column + "_normality" + ] = "True" else: - normality.append(pd.NA) - # check to see if exclusive metric - if column in thicket.exc_metrics: - thicket.statsframe.exc_metrics.append(column + "_normality") - # check to see if inclusive metric - else: - thicket.statsframe.inc_metrics.append(column + "_normality") - - thicket.statsframe.dataframe[column + "_normality"] = normality + thicket.statsframe.dataframe.loc[ + df.index[i], column + "_normality" + ] = pd.NA + # check to see if exclusive metric + if column in thicket.exc_metrics: + thicket.statsframe.exc_metrics.append(column + "_normality") + # check to see if inclusive metric + else: + thicket.statsframe.inc_metrics.append(column + "_normality") # columnar joined thicket object else: + df = ( + thicket.dataframe.select_dtypes(include="number") + .reset_index(level=1) + .groupby("node") + .agg(stats.shapiro) + ) for idx, column in columns: - normality = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - pvalue = stats.shapiro(thicket.dataframe.loc[node][(idx, column)])[1] + for i in range(0, len(df[(idx, column)])): + pvalue = df[(idx, column)][i].pvalue if pvalue < 0.05: - normality.append("False") + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = "False" elif pvalue > 0.05: - normality.append("True") + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = "True" else: - normality.append(pd.NA) + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = pd.NA # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_normality")) @@ -74,7 +94,5 @@ def check_normality(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_normality")) - thicket.statsframe.dataframe[(idx, column + "_normality")] = normality - # sort columns in index thicket.statsframe.dataframe = 
thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/correlation_nodewise.py b/thicket/stats/correlation_nodewise.py index 86e3f2d5..6969dc91 100644 --- a/thicket/stats/correlation_nodewise.py +++ b/thicket/stats/correlation_nodewise.py @@ -38,27 +38,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears ) # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node") correlated = [] - for node in thicket.statsframe.dataframe.index.tolist(): + for node, item in df: if correlation == "pearson": correlated.append( stats.pearsonr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "spearman": correlated.append( stats.spearmanr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "kendall": correlated.append( stats.kendalltau( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) else: @@ -70,27 +71,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears ] = correlated # columnar joined thicket object else: + df = thicket.dataframe.reset_index().groupby("node") correlated = [] - for node in thicket.statsframe.dataframe.index.tolist(): + for node, item in df: if correlation == "pearson": correlated.append( stats.pearsonr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "spearman": correlated.append( stats.spearmanr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif 
correlation == "kendall": correlated.append( stats.kendalltau( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) else: diff --git a/thicket/stats/maximum.py b/thicket/stats/maximum.py index 1e420235..6151fd52 100644 --- a/thicket/stats/maximum.py +++ b/thicket/stats/maximum.py @@ -3,8 +3,6 @@ # # SPDX-License-Identifier: MIT -import pandas as pd - from ..utils import verify_thicket_structures @@ -31,10 +29,9 @@ def maximum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(max) for column in columns: - maximum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - maximum.append(max(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_max"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_max") @@ -42,13 +39,11 @@ def maximum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append(column + "_max") - thicket.statsframe.dataframe[column + "_max"] = maximum # columnar joined thicket object else: + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(max) for idx, column in columns: - maximum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - maximum.append(max(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_max")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_max")) @@ -56,7 +51,5 @@ def maximum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_max")) - thicket.statsframe.dataframe[(idx, column + "_max")] = maximum - # sort columns in index thicket.statsframe.dataframe = 
thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/mean.py b/thicket/stats/mean.py index b07c117e..8ab670eb 100644 --- a/thicket/stats/mean.py +++ b/thicket/stats/mean.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures @@ -30,24 +29,22 @@ def mean(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.mean) for column in columns: - mean = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - mean.append(np.mean(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_mean"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_mean") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_mean") - - thicket.statsframe.dataframe[column + "_mean"] = mean # columnar joined thicket object else: + df = ( + thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.mean) + ) for idx, column in columns: - mean = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - mean.append(np.mean(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_mean")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_mean")) @@ -55,7 +52,5 @@ def mean(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_mean")) - thicket.statsframe.dataframe[(idx, column + "_mean")] = mean - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/median.py b/thicket/stats/median.py index 8c498246..c57b552a 100644 --- a/thicket/stats/median.py +++ 
b/thicket/stats/median.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures @@ -30,10 +29,9 @@ def median(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.median) for column in columns: - median = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - median.append(np.median(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_median"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_median") @@ -41,13 +39,16 @@ def median(thicket, columns=None): else: thicket.statsframe.inc_metrics.append(column + "_median") - thicket.statsframe.dataframe[column + "_median"] = median # columnar joined thicket object else: + df = ( + thicket.dataframe[columns] + .reset_index(level=1) + .groupby("node") + .agg(np.median) + ) for idx, column in columns: - median = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - median.append(np.median(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_median")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_median")) @@ -55,7 +56,5 @@ def median(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_median")) - thicket.statsframe.dataframe[(idx, column + "_median")] = median - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/minimum.py b/thicket/stats/minimum.py index 7add4964..54dac892 100644 --- a/thicket/stats/minimum.py +++ b/thicket/stats/minimum.py @@ -3,8 +3,6 @@ # # SPDX-License-Identifier: MIT -import pandas as pd - 
from ..utils import verify_thicket_structures @@ -31,24 +29,20 @@ def minimum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(min) for column in columns: - minimum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - minimum.append(min(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_min"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_min") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_min") - - thicket.statsframe.dataframe[column + "_min"] = minimum # columnar joined thicket object else: + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(min) for idx, column in columns: - minimum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - minimum.append(min(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_min")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_min")) @@ -56,7 +50,5 @@ def minimum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_min")) - thicket.statsframe.dataframe[(idx, column + "_min")] = minimum - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/percentiles.py b/thicket/stats/percentiles.py index db857af2..81fec98f 100644 --- a/thicket/stats/percentiles.py +++ b/thicket/stats/percentiles.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: MIT -import numpy as np import pandas as pd from ..utils import verify_thicket_structures @@ -39,31 +38,32 @@ def percentiles(thicket, columns=None): # thicket object without columnar index if 
thicket.dataframe.columns.nlevels == 1: + # select numeric columns within thicket (.quantiles) will not work without this step + numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] + df_num = thicket.dataframe.select_dtypes(include=numerics) + df = df_num.reset_index().groupby("node").quantile([0.25, 0.50, 0.75]) for column in columns: percentiles = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - percentiles.append( - np.percentile(thicket.dataframe.loc[node][column], [25, 50, 75]) - ) + for node in pd.unique(df.reset_index()["node"].tolist()): + percentiles.append(list(df.loc[node][column])) + thicket.statsframe.dataframe[column + "_percentiles"] = percentiles # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_percentiles") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_percentiles") - - thicket.statsframe.dataframe[column + "_percentiles"] = percentiles # columnar joined thicket object else: + numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] + df_num = thicket.dataframe.select_dtypes(include=numerics) + df = df_num.reset_index(level=1).groupby("node").quantile([0.25, 0.50, 0.75]) + percentiles = [] for idx, column in columns: percentiles = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - percentiles.append( - np.percentile( - thicket.dataframe.loc[node][(idx, column)], - [25, 50, 75], - ) - ) + for node in pd.unique(df.reset_index()["node"].tolist()): + percentiles.append(list(df.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_percentiles")) @@ -71,7 +71,5 @@ def percentiles(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + 
"_percentiles")) - thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/std.py b/thicket/stats/std.py index 7feb44b7..e2244161 100644 --- a/thicket/stats/std.py +++ b/thicket/stats/std.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures @@ -32,24 +31,20 @@ def std(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.std) for column in columns: - std = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - std.append(np.std(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_std"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_std") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_std") - - thicket.statsframe.dataframe[column + "_std"] = std # columnar joined thicket object else: + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.std) for idx, column in columns: - std = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - std.append(np.std(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_std")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_std")) @@ -57,7 +52,5 @@ def std(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_std")) - thicket.statsframe.dataframe[(idx, column + "_std")] = std - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git 
a/thicket/stats/variance.py b/thicket/stats/variance.py index f3840501..4b87f6cc 100644 --- a/thicket/stats/variance.py +++ b/thicket/stats/variance.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures @@ -33,31 +32,26 @@ def variance(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.var) for column in columns: - var = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - var.append(np.var(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_var"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_var") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_var") - - thicket.statsframe.dataframe[column + "_var"] = var # columnar joined thicket object else: + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.var) for idx, column in columns: - var = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - var.append(np.var(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_var")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_var")) # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append((idx, column + "_var")) - thicket.statsframe.dataframe[(idx, column + "_var")] = var # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)