From e45b783bb8b69382690a6973f817633941645d3d Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Thu, 20 Jul 2023 13:02:05 -0700 Subject: [PATCH 01/13] Adding group by functionality to mean, median, minimum, and variance --- thicket/stats/maximum.py | 12 ++++-------- thicket/stats/mean.py | 14 ++++---------- thicket/stats/median.py | 13 ++++--------- thicket/stats/minimum.py | 14 ++++---------- thicket/stats/variance.py | 13 ++++--------- 5 files changed, 20 insertions(+), 46 deletions(-) diff --git a/thicket/stats/maximum.py b/thicket/stats/maximum.py index 1e420235..3af9424e 100644 --- a/thicket/stats/maximum.py +++ b/thicket/stats/maximum.py @@ -31,10 +31,9 @@ def maximum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(max) for column in columns: - maximum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - maximum.append(max(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_max"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_max") @@ -42,13 +41,11 @@ def maximum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append(column + "_max") - thicket.statsframe.dataframe[column + "_max"] = maximum # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(max) for idx, column in columns: - maximum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - maximum.append(max(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_max")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_max")) @@ -56,7 +53,6 @@ def maximum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_max")) - thicket.statsframe.dataframe[(idx, column + "_max")] = maximum # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/mean.py b/thicket/stats/mean.py index b07c117e..ee5e44a2 100644 --- a/thicket/stats/mean.py +++ b/thicket/stats/mean.py @@ -30,24 +30,20 @@ def mean(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(np.mean) for column in columns: - mean = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - mean.append(np.mean(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_mean"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_mean") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_mean") - - thicket.statsframe.dataframe[column + "_mean"] = mean # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.mean) for idx, column in columns: - mean = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - mean.append(np.mean(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_mean")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_mean")) @@ -55,7 +51,5 @@ def mean(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_mean")) - thicket.statsframe.dataframe[(idx, column + "_mean")] = mean - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/median.py b/thicket/stats/median.py index 8c498246..23de5263 100644 --- a/thicket/stats/median.py +++ b/thicket/stats/median.py @@ -30,10 +30,9 @@ def median(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(np.median) for column in columns: - median = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - median.append(np.median(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_median"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_median") @@ -41,13 +40,11 @@ def median(thicket, columns=None): else: thicket.statsframe.inc_metrics.append(column + "_median") - thicket.statsframe.dataframe[column + "_median"] = median # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.median) for idx, column in columns: - median = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - median.append(np.median(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_median")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_median")) @@ -55,7 +52,5 @@ def median(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_median")) - thicket.statsframe.dataframe[(idx, column + "_median")] = median - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/minimum.py b/thicket/stats/minimum.py index 7add4964..daf4ae78 100644 --- a/thicket/stats/minimum.py +++ b/thicket/stats/minimum.py @@ -31,24 +31,20 @@ def minimum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(min) for column in columns: - minimum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - minimum.append(min(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_min"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_min") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_min") - - thicket.statsframe.dataframe[column + "_min"] = minimum # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(min) for idx, column in columns: - minimum = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - minimum.append(min(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_min")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_min")) @@ -56,7 +52,5 @@ def minimum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_min")) - thicket.statsframe.dataframe[(idx, column + "_min")] = minimum - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/variance.py b/thicket/stats/variance.py index f3840501..15ca580b 100644 --- a/thicket/stats/variance.py +++ b/thicket/stats/variance.py @@ -33,31 +33,26 @@ def variance(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(np.var) for column in columns: - var = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - var.append(np.var(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_var"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_var") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_var") - - thicket.statsframe.dataframe[column + "_var"] = var # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.var) for idx, column in columns: - var = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - var.append(np.var(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_var")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_var")) # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append((idx, column + "_var")) - thicket.statsframe.dataframe[(idx, column + "_var")] = var # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) From bd5a8983a7ea14552e57cb521050f4881c0de747 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Fri, 28 Jul 2023 18:03:38 -0700 Subject: [PATCH 02/13] Adding groupby functionlaity for std.py --- thicket/stats/std.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/thicket/stats/std.py b/thicket/stats/std.py index 7feb44b7..0f87c630 100644 --- a/thicket/stats/std.py +++ b/thicket/stats/std.py @@ -32,24 +32,20 @@ def std(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(np.std) for column in columns: - std = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - std.append(np.std(thicket.dataframe.loc[node][column])) + thicket.statsframe.dataframe[column + "_std"] = df[column] # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_std") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_std") - - thicket.statsframe.dataframe[column + "_std"] = std # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.std) for idx, column in columns: - std = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - std.append(np.std(thicket.dataframe.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_std")] = df[(idx, column)] # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_std")) @@ -57,7 +53,5 @@ def std(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_std")) - thicket.statsframe.dataframe[(idx, column + "_std")] = std - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) From ed6108871b06e39d410f8ab233d9ad1f9fa371c6 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Fri, 28 Jul 2023 18:53:28 -0700 Subject: [PATCH 03/13] Adding groupby functionality to check_normality.py --- thicket/stats/check_normality.py | 40 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index 905e637b..b6ed43c0 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -35,38 +35,36 @@ def check_normality(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node").agg(stats.shapiro) for column in columns: - normality = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - pvalue = stats.shapiro(thicket.dataframe.loc[node][column])[1] + for i in len(df[column]): + pvalue = df[column][i].pvalue if pvalue < 0.05: - normality.append("False") + thicket.statsframe.dataframe.loc[df.index[i], column + "_normality"] = "False" elif pvalue > 0.05: - normality.append("True") + thicket.statsframe.dataframe.loc[df.index[i], column + "_normality"] = "True" else: - normality.append(pd.NA) - # check to see if exclusive metric - if column in thicket.exc_metrics: - thicket.statsframe.exc_metrics.append(column + "_normality") - # check to see if inclusive metric - else: - thicket.statsframe.inc_metrics.append(column + "_normality") - - thicket.statsframe.dataframe[column + "_normality"] = normality + thicket.stataframe.dataframe.loc[df.index[i], column + "_normality"] = pd.NA + # check to see if exclusive metric + if column in thicket.exc_metrics: + thicket.statsframe.exc_metrics.append(column + "_normality") + # check to see if inclusive metric + else: + thicket.statsframe.inc_metrics.append(column + "_normality") # columnar joined thicket object else: + df = thicket.dataframe.reset_index(level=1).groupby("node").agg(stats.shapiro) for idx, column in columns: - normality = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - pvalue = stats.shapiro(thicket.dataframe.loc[node][(idx, column)])[1] + for i in len(df[(idx, column)]): + pvalue = df[column][i].pvalue if pvalue < 0.05: - normality.append("False") + thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = "False" elif pvalue > 0.05: - normality.append("True") + thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = "True" else: - normality.append(pd.NA) + thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = pd.NA # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_normality")) @@ -74,7 +72,5 @@ def check_normality(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_normality")) - thicket.statsframe.dataframe[(idx, column + "_normality")] = normality - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) From eb21c50daaf552d4ea8afab192d297fa8a9bb709 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Sat, 29 Jul 2023 21:41:05 -0700 Subject: [PATCH 04/13] Adding groupby functionality to percentiles.py --- thicket/stats/percentiles.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/thicket/stats/percentiles.py b/thicket/stats/percentiles.py index db857af2..1a721a31 100644 --- a/thicket/stats/percentiles.py +++ b/thicket/stats/percentiles.py @@ -39,31 +39,32 @@ def percentiles(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + # select numeric columns within thicket (.quantiles) will not work without this step + numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + df_num = thicket.dataframe.select_dtypes(include=numerics) + df = df_num.reset_index().groupby("node").quantile([0.25, 0.50, 0.75]) for column in columns: percentiles = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - percentiles.append( - np.percentile(thicket.dataframe.loc[node][column], [25, 50, 75]) - ) + for node in pd.unique(df.reset_index()["node"].tolist()): + percentiles.append(list(df.loc[node][column])) + thicket.statsframe.dataframe[column + "_percentiles"] = percentiles # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_percentiles") # check to see if inclusive metric else: thicket.statsframe.inc_metrics.append(column + "_percentiles") - - thicket.statsframe.dataframe[column + "_percentiles"] = percentiles # columnar joined thicket object else: + numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + df_num = thicket.dataframe.select_dtypes(include=numerics) + df = df_num.reset_index(level=1).groupby("node").quantile([0.25, 0.50, 0.75]) + percentiles = [] for idx, column in columns: percentiles = [] - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - percentiles.append( - np.percentile( - thicket.dataframe.loc[node][(idx, column)], - [25, 50, 75], - ) - ) + for node in pd.unique(df.reset_index()["node"].tolist()): + percentiles.append(list(df.loc[node][(idx, column)])) + thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_percentiles")) @@ -71,7 +72,5 @@ def percentiles(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_percentiles")) - thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) From 01fa7dfa55651b02c2aa64404eaea41ccd0fc1c1 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Mon, 31 Jul 2023 09:09:12 -0700 Subject: [PATCH 05/13] Updating correlation_nodewise.py to work with groupby functionality --- thicket/stats/correlation_nodewise.py | 30 ++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/thicket/stats/correlation_nodewise.py b/thicket/stats/correlation_nodewise.py index 86e3f2d5..6969dc91 100644 --- a/thicket/stats/correlation_nodewise.py +++ b/thicket/stats/correlation_nodewise.py @@ -38,27 +38,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears ) # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: + df = thicket.dataframe.reset_index().groupby("node") correlated = [] - for node in thicket.statsframe.dataframe.index.tolist(): + for node, item in df: if correlation == "pearson": correlated.append( stats.pearsonr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "spearman": correlated.append( stats.spearmanr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "kendall": correlated.append( stats.kendalltau( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) else: @@ -70,27 +71,28 @@ def correlation_nodewise(thicket, column1=None, column2=None, correlation="pears ] = correlated # columnar joined thicket object else: + df = thicket.dataframe.reset_index().groupby("node") correlated = [] - for node in thicket.statsframe.dataframe.index.tolist(): + for node, item in df: if correlation == "pearson": correlated.append( stats.pearsonr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "spearman": correlated.append( stats.spearmanr( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) elif correlation == "kendall": correlated.append( stats.kendalltau( - thicket.dataframe.loc[node][column1], - thicket.dataframe.loc[node][column2], + df.get_group(node)[column1], + df.get_group(node)[column2], )[0] ) else: From aa3f4287985c3bf47df2ce1f8b6db5483c2fcc37 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Mon, 31 Jul 2023 09:39:18 -0700 Subject: [PATCH 06/13] Adding groupby functionality to calc_boxplot_statistics.py --- thicket/stats/calc_boxplot_statistics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/thicket/stats/calc_boxplot_statistics.py b/thicket/stats/calc_boxplot_statistics.py index d26eeb57..51055387 100644 --- a/thicket/stats/calc_boxplot_statistics.py +++ b/thicket/stats/calc_boxplot_statistics.py @@ -51,8 +51,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], ** col + "_outliers" + q_list: [], } - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - values = thicket.dataframe.loc[node][col].tolist() + df = thicket.dataframe.reset_index().groupby("node") + for node, item in df: + values = df.get_group(node)[col].tolist() q = np.quantile(values, quartiles) q1 = q[0] @@ -107,8 +108,9 @@ def calc_boxplot_statistics(thicket, columns=[], quartiles=[0.25, 0.5, 0.75], ** } } - for node in pd.unique(thicket.dataframe.reset_index()["node"].tolist()): - values = thicket.dataframe.loc[node][(idx, col)].tolist() + df = thicket.dataframe.reset_index().groupby("node") + for node, item in df: + values = df.get_group(node)[(idx, col)].tolist() q = np.quantile(values, quartiles) q1 = q[0] From 75410e1bf09db7b8c8e31e58ff04f5b1878555ba Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Mon, 31 Jul 2023 10:16:55 -0700 Subject: [PATCH 07/13] Black/flake8 formatting changes --- thicket/stats/check_normality.py | 24 ++++++++++++++++++------ thicket/stats/maximum.py | 3 --- thicket/stats/mean.py | 1 - thicket/stats/median.py | 1 - thicket/stats/minimum.py | 2 -- thicket/stats/percentiles.py | 5 ++--- thicket/stats/std.py | 1 - thicket/stats/variance.py | 1 - 8 files changed, 20 insertions(+), 18 deletions(-) diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index b6ed43c0..796bd856 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -41,11 +41,17 @@ def check_normality(thicket, columns=None): pvalue = df[column][i].pvalue if pvalue < 0.05: - thicket.statsframe.dataframe.loc[df.index[i], column + "_normality"] = "False" + thicket.statsframe.dataframe.loc[ + df.index[i], column + "_normality" + ] = "False" elif pvalue > 0.05: - thicket.statsframe.dataframe.loc[df.index[i], column + "_normality"] = "True" + thicket.statsframe.dataframe.loc[ + df.index[i], column + "_normality" + ] = "True" else: - thicket.stataframe.dataframe.loc[df.index[i], column + "_normality"] = pd.NA + thicket.stataframe.dataframe.loc[ + df.index[i], column + "_normality" + ] = pd.NA # check to see if exclusive metric if column in thicket.exc_metrics: thicket.statsframe.exc_metrics.append(column + "_normality") @@ -60,11 +66,17 @@ def check_normality(thicket, columns=None): pvalue = df[column][i].pvalue if pvalue < 0.05: - thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = "False" + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = "False" elif pvalue > 0.05: - thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = "True" + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = "True" else: - thicket.statsframe.dataframe.loc[df.index[i], (idx, column + "_normality")] = pd.NA + thicket.statsframe.dataframe.loc[ + df.index[i], (idx, column + "_normality") + ] = pd.NA # check to see if exclusive metric if (idx, column) in thicket.exc_metrics: thicket.statsframe.exc_metrics.append((idx, column + "_normality")) diff --git a/thicket/stats/maximum.py b/thicket/stats/maximum.py index 3af9424e..803f189c 100644 --- a/thicket/stats/maximum.py +++ b/thicket/stats/maximum.py @@ -3,8 +3,6 @@ # # SPDX-License-Identifier: MIT -import pandas as pd - from ..utils import verify_thicket_structures @@ -53,6 +51,5 @@ def maximum(thicket, columns=None): else: thicket.statsframe.inc_metrics.append((idx, column + "_max")) - # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/stats/mean.py b/thicket/stats/mean.py index ee5e44a2..8a28979b 100644 --- a/thicket/stats/mean.py +++ b/thicket/stats/mean.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures diff --git a/thicket/stats/median.py b/thicket/stats/median.py index 23de5263..ab06f4e8 100644 --- a/thicket/stats/median.py +++ b/thicket/stats/median.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures diff --git a/thicket/stats/minimum.py b/thicket/stats/minimum.py index daf4ae78..ed24f558 100644 --- a/thicket/stats/minimum.py +++ b/thicket/stats/minimum.py @@ -3,8 +3,6 @@ # # SPDX-License-Identifier: MIT -import pandas as pd - from ..utils import verify_thicket_structures diff --git a/thicket/stats/percentiles.py b/thicket/stats/percentiles.py index 1a721a31..81fec98f 100644 --- a/thicket/stats/percentiles.py +++ b/thicket/stats/percentiles.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: MIT -import numpy as np import pandas as pd from ..utils import verify_thicket_structures @@ -40,7 +39,7 @@ def percentiles(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: # select numeric columns within thicket (.quantiles) will not work without this step - numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] df_num = thicket.dataframe.select_dtypes(include=numerics) df = df_num.reset_index().groupby("node").quantile([0.25, 0.50, 0.75]) for column in columns: @@ -56,7 +55,7 @@ def percentiles(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_percentiles") # columnar joined thicket object else: - numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] df_num = thicket.dataframe.select_dtypes(include=numerics) df = df_num.reset_index(level=1).groupby("node").quantile([0.25, 0.50, 0.75]) percentiles = [] diff --git a/thicket/stats/std.py b/thicket/stats/std.py index 0f87c630..faa81b46 100644 --- a/thicket/stats/std.py +++ b/thicket/stats/std.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures diff --git a/thicket/stats/variance.py b/thicket/stats/variance.py index 15ca580b..c734ac93 100644 --- a/thicket/stats/variance.py +++ b/thicket/stats/variance.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import numpy as np -import pandas as pd from ..utils import verify_thicket_structures From a23ed09073469ea2f97bd7b9a50edd0e3365ec05 Mon Sep 17 00:00:00 2001 From: Treece Alexander Burgess Date: Mon, 31 Jul 2023 13:35:11 -0700 Subject: [PATCH 08/13] Fixing check_normality.py --- thicket/stats/check_normality.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index 796bd856..ea333fae 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -35,9 +35,14 @@ def check_normality(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(stats.shapiro) + df = ( + thicket.dataframe.drop(columns="name") + .reset_index() + .groupby("node") + .agg(stats.shapiro) + ) for column in columns: - for i in len(df[column]): + for i in range(0, len(df[column])): pvalue = df[column][i].pvalue if pvalue < 0.05: @@ -60,10 +65,15 @@ def check_normality(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_normality") # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(stats.shapiro) + df = ( + thicket.dataframe.drop(columns=("name", "")) + .reset_index(level=1) + .groupby("node") + .agg(stats.shapiro) + ) for idx, column in columns: - for i in len(df[(idx, column)]): - pvalue = df[column][i].pvalue + for i in range(0, len(df[(idx, column)])): + pvalue = df[(idx, column)][i].pvalue if pvalue < 0.05: thicket.statsframe.dataframe.loc[ From ff452578ce9147fe6932ef6cba301a24e380d98e Mon Sep 17 00:00:00 2001 From: Michael Richard McKinsey Date: Wed, 30 Aug 2023 16:34:01 -0500 Subject: [PATCH 09/13] Bugfix functions --- thicket/stats/mean.py | 4 ++-- thicket/stats/variance.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thicket/stats/mean.py b/thicket/stats/mean.py index 8a28979b..8c83fc90 100644 --- a/thicket/stats/mean.py +++ b/thicket/stats/mean.py @@ -29,7 +29,7 @@ def mean(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(np.mean) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.mean) for column in columns: thicket.statsframe.dataframe[column + "_mean"] = df[column] # check to see if exclusive metric @@ -40,7 +40,7 @@ def mean(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_mean") # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.mean) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.mean) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_mean")] = df[(idx, column)] # check to see if exclusive metric diff --git a/thicket/stats/variance.py b/thicket/stats/variance.py index c734ac93..4b87f6cc 100644 --- a/thicket/stats/variance.py +++ b/thicket/stats/variance.py @@ -32,7 +32,7 @@ def variance(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(np.var) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.var) for column in columns: thicket.statsframe.dataframe[column + "_var"] = df[column] # check to see if exclusive metric @@ -43,7 +43,7 @@ def variance(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_var") # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.var) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.var) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_var")] = df[(idx, column)] # check to see if exclusive metric From 201946e2da4682cb228f0160da580015a5b4ff33 Mon Sep 17 00:00:00 2001 From: Michael Richard McKinsey Date: Wed, 30 Aug 2023 16:41:57 -0500 Subject: [PATCH 10/13] Bugfix remaining functions --- thicket/stats/maximum.py | 4 ++-- thicket/stats/median.py | 4 ++-- thicket/stats/minimum.py | 4 ++-- thicket/stats/std.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/thicket/stats/maximum.py b/thicket/stats/maximum.py index 803f189c..6151fd52 100644 --- a/thicket/stats/maximum.py +++ b/thicket/stats/maximum.py @@ -29,7 +29,7 @@ def maximum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(max) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(max) for column in columns: thicket.statsframe.dataframe[column + "_max"] = df[column] # check to see if exclusive metric @@ -41,7 +41,7 @@ def maximum(thicket, columns=None): # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(max) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(max) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_max")] = df[(idx, column)] # check to see if exclusive metric diff --git a/thicket/stats/median.py b/thicket/stats/median.py index ab06f4e8..0c17ddbb 100644 --- a/thicket/stats/median.py +++ b/thicket/stats/median.py @@ -29,7 +29,7 @@ def median(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(np.median) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.median) for column in columns: thicket.statsframe.dataframe[column + "_median"] = df[column] # check to see if exclusive metric @@ -41,7 +41,7 @@ def median(thicket, columns=None): # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.median) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.median) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_median")] = df[(idx, column)] # check to see if exclusive metric diff --git a/thicket/stats/minimum.py b/thicket/stats/minimum.py index ed24f558..54dac892 100644 --- a/thicket/stats/minimum.py +++ b/thicket/stats/minimum.py @@ -29,7 +29,7 @@ def minimum(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(min) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(min) for column in columns: thicket.statsframe.dataframe[column + "_min"] = df[column] # check to see if exclusive metric @@ -40,7 +40,7 @@ def minimum(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_min") # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(min) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(min) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_min")] = df[(idx, column)] # check to see if exclusive metric diff --git a/thicket/stats/std.py b/thicket/stats/std.py index faa81b46..e2244161 100644 --- a/thicket/stats/std.py +++ b/thicket/stats/std.py @@ -31,7 +31,7 @@ def std(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - df = thicket.dataframe.reset_index().groupby("node").agg(np.std) + df = thicket.dataframe[columns].reset_index().groupby("node").agg(np.std) for column in columns: thicket.statsframe.dataframe[column + "_std"] = df[column] # check to see if exclusive metric @@ -42,7 +42,7 @@ def std(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_std") # columnar joined thicket object else: - df = thicket.dataframe.reset_index(level=1).groupby("node").agg(np.std) + df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.std) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_std")] = df[(idx, column)] # check to see if exclusive metric From 8265da6966de6e1cb22db8d4625b9eb4821e820a Mon Sep 17 00:00:00 2001 From: Michael Richard McKinsey Date: Wed, 30 Aug 2023 16:42:41 -0500 Subject: [PATCH 11/13] Black --- thicket/stats/mean.py | 4 +++- thicket/stats/median.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/thicket/stats/mean.py b/thicket/stats/mean.py index 8c83fc90..8ab670eb 100644 --- a/thicket/stats/mean.py +++ b/thicket/stats/mean.py @@ -40,7 +40,9 @@ def mean(thicket, columns=None): thicket.statsframe.inc_metrics.append(column + "_mean") # columnar joined thicket object else: - df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.mean) + df = ( + thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.mean) + ) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_mean")] = df[(idx, column)] # check to see if exclusive metric diff --git a/thicket/stats/median.py b/thicket/stats/median.py index 0c17ddbb..c57b552a 100644 --- a/thicket/stats/median.py +++ b/thicket/stats/median.py @@ -41,7 +41,12 @@ def median(thicket, columns=None): # columnar joined thicket object else: - df = thicket.dataframe[columns].reset_index(level=1).groupby("node").agg(np.median) + df = ( + thicket.dataframe[columns] + .reset_index(level=1) + .groupby("node") + .agg(np.median) + ) for idx, column in columns: thicket.statsframe.dataframe[(idx, column + "_median")] = df[(idx, column)] # check to see if exclusive metric From d94a79b06dd686b94da9da76f28c640f991e4d5a Mon Sep 17 00:00:00 2001 From: Michael Richard McKinsey Date: Wed, 6 Sep 2023 12:36:22 -0500 Subject: [PATCH 12/13] Only aggregate numerical columns --- thicket/stats/check_normality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index ea333fae..689d0131 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -36,7 +36,7 @@ def check_normality(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: df = ( - thicket.dataframe.drop(columns="name") + thicket.dataframe.select_dtypes(include='number') .reset_index() .groupby("node") .agg(stats.shapiro) @@ -66,7 +66,7 @@ def check_normality(thicket, columns=None): # columnar joined thicket object else: df = ( - thicket.dataframe.drop(columns=("name", "")) + thicket.dataframe.select_dtypes(include='number') .reset_index(level=1) .groupby("node") .agg(stats.shapiro) From e3770e6ef4d267ad825960c40506dabcd9a2921e Mon Sep 17 00:00:00 2001 From: Michael Richard McKinsey Date: Wed, 6 Sep 2023 12:38:59 -0500 Subject: [PATCH 13/13] Black --- thicket/stats/check_normality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thicket/stats/check_normality.py b/thicket/stats/check_normality.py index 689d0131..e8d55633 100644 --- a/thicket/stats/check_normality.py +++ b/thicket/stats/check_normality.py @@ -36,7 +36,7 @@ def check_normality(thicket, columns=None): # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: df = ( - thicket.dataframe.select_dtypes(include='number') + thicket.dataframe.select_dtypes(include="number") .reset_index() .groupby("node") .agg(stats.shapiro) @@ -66,7 +66,7 @@ def check_normality(thicket, columns=None): # columnar joined thicket object else: df = ( - thicket.dataframe.select_dtypes(include='number') + thicket.dataframe.select_dtypes(include="number") .reset_index(level=1) .groupby("node") .agg(stats.shapiro)