From 37049b54f47a2f96447974f74432e06329fa05fd Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Tue, 22 Oct 2024 15:57:07 -0500 Subject: [PATCH 1/3] Improve metadata_column_to_perfdata to add multiple columns at the same time --- thicket/ensemble.py | 4 +- thicket/groupby.py | 2 +- thicket/tests/test_thicket.py | 79 +++++++++++++++++++++++++---------- thicket/tests/test_tree.py | 3 +- thicket/thicket.py | 53 ++++++++++++++--------- 5 files changed, 95 insertions(+), 46 deletions(-) diff --git a/thicket/ensemble.py b/thicket/ensemble.py index a3960569..f0f77bd1 100644 --- a/thicket/ensemble.py +++ b/thicket/ensemble.py @@ -210,7 +210,7 @@ def _handle_perfdata(): new_profiles = [i for i in range(len(thickets_cp[0].profile))] for i in range(len(thickets_cp)): thickets_cp[i].metadata["new_profiles"] = new_profiles - thickets_cp[i].metadata_column_to_perfdata( + thickets_cp[i].metadata_columns_to_perfdata( "new_profiles", drop=True ) thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True) @@ -233,7 +233,7 @@ def _handle_perfdata(): else: # Change second-level index to be from metadata's "metadata_key" column for i in range(len(thickets_cp)): if metadata_key not in thickets_cp[i].dataframe.index.names: - thickets_cp[i].metadata_column_to_perfdata(metadata_key) + thickets_cp[i].metadata_columns_to_perfdata(metadata_key) thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True) new_mappings.update( pd.Series( diff --git a/thicket/groupby.py b/thicket/groupby.py index 56b3c1ed..a90540d7 100644 --- a/thicket/groupby.py +++ b/thicket/groupby.py @@ -83,7 +83,7 @@ def _agg_rows(col_series): if col not in index_names: if col in tk_c.metadata.columns or col in df_columns: if col not in df_columns: - tk_c.metadata_column_to_perfdata(col) + tk_c.metadata_columns_to_perfdata(col) tk_c.dataframe = tk_c.dataframe.set_index(col, append=True) else: raise KeyError(f'"{col}" is not in the PerfData or MetaData.') diff --git a/thicket/tests/test_thicket.py b/thicket/tests/test_thicket.py index 4a2ff5b8..c217955c 100644 --- a/thicket/tests/test_thicket.py +++ b/thicket/tests/test_thicket.py @@ -80,28 +80,63 @@ def _test_multiindex(): assert bool(re.search("1.000.*Basic_COPY8", tree_output)) -def test_metadata_column_to_perfdata(mpi_scaling_cali): - t_ens = Thicket.from_caliperreader(mpi_scaling_cali, disable_tqdm=True) - - example_column = "jobsize" - example_column_metrics = [27, 64, 125, 216, 343] - - # Column should be in metadata table - assert example_column in t_ens.metadata - # Column should not be in performance data table - assert example_column not in t_ens.dataframe - # Assume second level index is profile - assert t_ens.dataframe.index.names[1] == "profile" - - t_ens.metadata_column_to_perfdata(example_column) - - # Column should be in performance data table - assert example_column in t_ens.dataframe - - # Check that the metrics exist in the performance data table - values = t_ens.dataframe[example_column].values.astype("int") - for metric in example_column_metrics: - assert metric in values +def test_metadata_columns_to_perfdata( + rajaperf_cuda_block128_1M_cali, rajaperf_seq_O3_1M_cali +): + tk = Thicket.from_caliperreader( + [rajaperf_cuda_block128_1M_cali[0], rajaperf_seq_O3_1M_cali[0]], + disable_tqdm=True, + ) + tkc1 = tk.deepcopy() + + tk.metadata_columns_to_perfdata(["variant", "tuning"]) + + # Check columns added + assert "variant" in tk.dataframe.columns and "tuning" in tk.dataframe.columns + + # Check overwrite warning raised + with pytest.warns(UserWarning, match=r"Column .* already exists"): + tk.metadata_columns_to_perfdata(["variant", "tuning"]) + + # Check drop works + tkc2 = tk.deepcopy() + tkc2.metadata_columns_to_perfdata("variant", overwrite=True, drop=True) + assert "variant" not in tkc2.metadata + + # Check error raise for join_key + tkc2.dataframe = tkc2.dataframe.reset_index(level="profile", drop=True) + with pytest.raises(KeyError, match="'profile' must be present"): + tkc2.metadata_columns_to_perfdata("tuning", overwrite=True) + + # Check alternate join key + tk.metadata_columns_to_perfdata("ProblemSizeRunParam") + tk.metadata_columns_to_perfdata("user", join_key="ProblemSizeRunParam") + assert "user" in tk.dataframe + + # Check column axis Thicket + # 1. without metadata_key + gb = tkc1.groupby(["variant", "tuning"]) + ctk = Thicket.concat_thickets( + thickets=list(gb.values()), + axis="columns", + headers=list(gb.keys()), + ) + ctk.metadata_columns_to_perfdata( + metadata_keys=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")] + ) + assert (("Base_CUDA", "block_128"), "ProblemSizeRunParam") in ctk.dataframe.columns + # 2. with metadata_key + ctk2 = Thicket.concat_thickets( + thickets=list(gb.values()), + axis="columns", + headers=list(gb.keys()), + metadata_key="ProblemSizeRunParam", + ) + ctk2.metadata_columns_to_perfdata( + metadata_keys=[(("Base_CUDA", "block_128"), "user")], + join_key="ProblemSizeRunParam", + ) + assert (("Base_CUDA", "block_128"), "user") in ctk2.dataframe.columns def test_perfdata_column_to_statsframe(literal_thickets, mpi_scaling_cali): diff --git a/thicket/tests/test_tree.py b/thicket/tests/test_tree.py index f10cf167..298a28b2 100644 --- a/thicket/tests/test_tree.py +++ b/thicket/tests/test_tree.py @@ -14,8 +14,7 @@ def test_indices(rajaperf_unique_tunings): # No error tk.tree(metric_column="Avg time/rank", indices=tk.profile[0]) - tk.metadata_column_to_perfdata("variant") - tk.metadata_column_to_perfdata("tuning") + tk.metadata_columns_to_perfdata(["variant", "tuning"]) # Error because there are duplicate variants. We need to add the tuning to the index as well. tk.dataframe = ( diff --git a/thicket/thicket.py b/thicket/thicket.py index 32f2c2d2..ce53cab8 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -617,35 +617,50 @@ def _rep_agg_func(col): rsuffix="_right", ) - def metadata_column_to_perfdata(self, metadata_key, overwrite=False, drop=False): - """Add a column from the metadata table to the performance data table. + def metadata_columns_to_perfdata( + self, metadata_keys, overwrite=False, drop=False, join_key="profile" + ): + """Add columns from the metadata table to the performance data table. Joins on join_key, an index or column that is present in both tables. Arguments: - metadata_key (str): Name of the column from the metadata table + metadata_keys (list or str): List of the columns from the metadata table overwrite (bool): Determines overriding behavior in performance data table - drop (bool): Whether to drop the column from the metadata table afterwards + drop (bool): Whether to drop the columns from the metadata table afterwards + join_key (str): Name of the index/column to join on if not 'profile' """ + # Raise error if join_key is not present in both tables + if not ( + join_key in self.dataframe.reset_index() + and join_key in self.metadata.reset_index() + ): + raise KeyError( + f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table." + ) + + # Convert metadata_keys to list if str + if isinstance(metadata_keys, str): + metadata_keys = [metadata_keys] + # Add warning if column already exists in performance data table - if metadata_key in self.dataframe.columns: - # Drop column to overwrite, otherwise warn and return - if overwrite: - self.dataframe.drop(metadata_key, axis=1, inplace=True) - else: - warnings.warn( - "Column " - + metadata_key - + " already exists. Set 'overwrite=True' to force update the column." - ) - return + for mkey in metadata_keys: + if mkey in self.dataframe.columns: + # Drop column to overwrite, otherwise warn and return + if overwrite: + self.dataframe.drop(mkey, axis=1, inplace=True) + else: + warnings.warn( + "Column " + + mkey + + " already exists. Set 'overwrite=True' to force update the column." + ) + return # Add the column to the performance data table - self.dataframe = self.dataframe.join( - self.metadata[metadata_key], on=self.dataframe.index.names[1] - ) + self.dataframe = self.dataframe.join(self.metadata[metadata_keys], on=join_key) # Drop column if drop: - self.metadata.drop(metadata_key, axis=1, inplace=True) + self.metadata.drop(metadata_keys, axis=1, inplace=True) def squash(self, update_inc_cols=True, new_statsframe=True): """Rewrite the Graph to include only nodes present in the performance From 67d13b56ff2e1276243d0a2c12a72e2072fa2485 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Tue, 22 Oct 2024 16:06:07 -0500 Subject: [PATCH 2/3] Rename argument for clarity --- thicket/tests/test_thicket.py | 4 ++-- thicket/thicket.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/thicket/tests/test_thicket.py b/thicket/tests/test_thicket.py index c217955c..52a46736 100644 --- a/thicket/tests/test_thicket.py +++ b/thicket/tests/test_thicket.py @@ -122,7 +122,7 @@ def test_metadata_columns_to_perfdata( headers=list(gb.keys()), ) ctk.metadata_columns_to_perfdata( - metadata_keys=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")] + metadata_columns=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")] ) assert (("Base_CUDA", "block_128"), "ProblemSizeRunParam") in ctk.dataframe.columns # 2. with metadata_key @@ -133,7 +133,7 @@ def test_metadata_columns_to_perfdata( metadata_key="ProblemSizeRunParam", ) ctk2.metadata_columns_to_perfdata( - metadata_keys=[(("Base_CUDA", "block_128"), "user")], + metadata_columns=[(("Base_CUDA", "block_128"), "user")], join_key="ProblemSizeRunParam", ) assert (("Base_CUDA", "block_128"), "user") in ctk2.dataframe.columns diff --git a/thicket/thicket.py b/thicket/thicket.py index ce53cab8..22ab4806 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -618,12 +618,12 @@ def _rep_agg_func(col): ) def metadata_columns_to_perfdata( - self, metadata_keys, overwrite=False, drop=False, join_key="profile" + self, metadata_columns, overwrite=False, drop=False, join_key="profile" ): """Add columns from the metadata table to the performance data table. Joins on join_key, an index or column that is present in both tables. Arguments: - metadata_keys (list or str): List of the columns from the metadata table + metadata_columns (list or str): List of the columns from the metadata table overwrite (bool): Determines overriding behavior in performance data table drop (bool): Whether to drop the columns from the metadata table afterwards join_key (str): Name of the index/column to join on if not 'profile' @@ -637,12 +637,12 @@ def metadata_columns_to_perfdata( f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table." ) - # Convert metadata_keys to list if str - if isinstance(metadata_keys, str): - metadata_keys = [metadata_keys] + # Convert metadata_columns to list if str + if isinstance(metadata_columns, str): + metadata_columns = [metadata_columns] # Add warning if column already exists in performance data table - for mkey in metadata_keys: + for mkey in metadata_columns: if mkey in self.dataframe.columns: # Drop column to overwrite, otherwise warn and return if overwrite: @@ -656,11 +656,11 @@ def metadata_columns_to_perfdata( return # Add the column to the performance data table - self.dataframe = self.dataframe.join(self.metadata[metadata_keys], on=join_key) + self.dataframe = self.dataframe.join(self.metadata[metadata_columns], on=join_key) # Drop column if drop: - self.metadata.drop(metadata_keys, axis=1, inplace=True) + self.metadata.drop(metadata_columns, axis=1, inplace=True) def squash(self, update_inc_cols=True, new_statsframe=True): """Rewrite the Graph to include only nodes present in the performance From 5cb40ae9e44bb75884587ae1246c84df81a39f27 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Tue, 22 Oct 2024 16:08:40 -0500 Subject: [PATCH 3/3] black --- thicket/thicket.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thicket/thicket.py b/thicket/thicket.py index 22ab4806..086060a3 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -656,7 +656,9 @@ def metadata_columns_to_perfdata( return # Add the column to the performance data table - self.dataframe = self.dataframe.join(self.metadata[metadata_columns], on=join_key) + self.dataframe = self.dataframe.join( + self.metadata[metadata_columns], on=join_key + ) # Drop column if drop: