From 37049b54f47a2f96447974f74432e06329fa05fd Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Tue, 22 Oct 2024 15:57:07 -0500
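Subject: [PATCH 1/3] Improve metadata_column_to_perfdata to add multiple
 columns at the same time

Rename Thicket.metadata_column_to_perfdata to
metadata_columns_to_perfdata and let it accept either a single column
name or a list of column names. Add a join_key argument (default
"profile") naming the index level or column to join on; a KeyError is
raised when join_key is missing from the performance data table or the
metadata table. Update the callers in ensemble.py, groupby.py and the
tests to use the new name.
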
---
 thicket/ensemble.py           |  4 +-
 thicket/groupby.py            |  2 +-
 thicket/tests/test_thicket.py | 79 +++++++++++++++++++++++++----------
 thicket/tests/test_tree.py    |  3 +-
 thicket/thicket.py            | 53 ++++++++++++++---------
 5 files changed, 95 insertions(+), 46 deletions(-)

diff --git a/thicket/ensemble.py b/thicket/ensemble.py
index a3960569..f0f77bd1 100644
--- a/thicket/ensemble.py
+++ b/thicket/ensemble.py
@@ -210,7 +210,7 @@ def _handle_perfdata():
                 new_profiles = [i for i in range(len(thickets_cp[0].profile))]
                 for i in range(len(thickets_cp)):
                     thickets_cp[i].metadata["new_profiles"] = new_profiles
-                    thickets_cp[i].metadata_column_to_perfdata(
+                    thickets_cp[i].metadata_columns_to_perfdata(
                         "new_profiles", drop=True
                     )
                     thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
@@ -233,7 +233,7 @@ def _handle_perfdata():
             else:  # Change second-level index to be from metadata's "metadata_key" column
                 for i in range(len(thickets_cp)):
                     if metadata_key not in thickets_cp[i].dataframe.index.names:
-                        thickets_cp[i].metadata_column_to_perfdata(metadata_key)
+                        thickets_cp[i].metadata_columns_to_perfdata(metadata_key)
                     thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
                     new_mappings.update(
                         pd.Series(
diff --git a/thicket/groupby.py b/thicket/groupby.py
index 56b3c1ed..a90540d7 100644
--- a/thicket/groupby.py
+++ b/thicket/groupby.py
@@ -83,7 +83,7 @@ def _agg_rows(col_series):
             if col not in index_names:
                 if col in tk_c.metadata.columns or col in df_columns:
                     if col not in df_columns:
-                        tk_c.metadata_column_to_perfdata(col)
+                        tk_c.metadata_columns_to_perfdata(col)
                     tk_c.dataframe = tk_c.dataframe.set_index(col, append=True)
                 else:
                     raise KeyError(f'"{col}" is not in the PerfData or MetaData.')
diff --git a/thicket/tests/test_thicket.py b/thicket/tests/test_thicket.py
index 4a2ff5b8..c217955c 100644
--- a/thicket/tests/test_thicket.py
+++ b/thicket/tests/test_thicket.py
@@ -80,28 +80,63 @@ def _test_multiindex():
     assert bool(re.search("1.000.*Basic_COPY8", tree_output))
 
 
-def test_metadata_column_to_perfdata(mpi_scaling_cali):
-    t_ens = Thicket.from_caliperreader(mpi_scaling_cali, disable_tqdm=True)
-
-    example_column = "jobsize"
-    example_column_metrics = [27, 64, 125, 216, 343]
-
-    # Column should be in metadata table
-    assert example_column in t_ens.metadata
-    # Column should not be in performance data table
-    assert example_column not in t_ens.dataframe
-    # Assume second level index is profile
-    assert t_ens.dataframe.index.names[1] == "profile"
-
-    t_ens.metadata_column_to_perfdata(example_column)
-
-    # Column should be in performance data table
-    assert example_column in t_ens.dataframe
-
-    # Check that the metrics exist in the performance data table
-    values = t_ens.dataframe[example_column].values.astype("int")
-    for metric in example_column_metrics:
-        assert metric in values
+def test_metadata_columns_to_perfdata(
+    rajaperf_cuda_block128_1M_cali, rajaperf_seq_O3_1M_cali
+):
+    tk = Thicket.from_caliperreader(
+        [rajaperf_cuda_block128_1M_cali[0], rajaperf_seq_O3_1M_cali[0]],
+        disable_tqdm=True,
+    )
+    tkc1 = tk.deepcopy()
+
+    tk.metadata_columns_to_perfdata(["variant", "tuning"])
+
+    # Both requested columns should now be in the performance data table
+    assert "variant" in tk.dataframe.columns and "tuning" in tk.dataframe.columns
+
+    # Without overwrite=True, re-adding existing columns only warns
+    with pytest.warns(UserWarning, match=r"Column .* already exists"):
+        tk.metadata_columns_to_perfdata(["variant", "tuning"])
+
+    # drop=True should remove the column from the metadata table
+    tkc2 = tk.deepcopy()
+    tkc2.metadata_columns_to_perfdata("variant", overwrite=True, drop=True)
+    assert "variant" not in tkc2.metadata
+
+    # A join_key missing from the performance data should raise KeyError
+    tkc2.dataframe = tkc2.dataframe.reset_index(level="profile", drop=True)
+    with pytest.raises(KeyError, match="'profile' must be present"):
+        tkc2.metadata_columns_to_perfdata("tuning", overwrite=True)
+
+    # Join on a column other than the default "profile"
+    tk.metadata_columns_to_perfdata("ProblemSizeRunParam")
+    tk.metadata_columns_to_perfdata("user", join_key="ProblemSizeRunParam")
+    assert "user" in tk.dataframe
+
+    # Column-axis Thicket: metadata columns are (header, name) tuples
+    # 1. without metadata_key
+    gb = tkc1.groupby(["variant", "tuning"])
+    ctk = Thicket.concat_thickets(
+        thickets=list(gb.values()),
+        axis="columns",
+        headers=list(gb.keys()),
+    )
+    ctk.metadata_columns_to_perfdata(
+        metadata_keys=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")]
+    )
+    assert (("Base_CUDA", "block_128"), "ProblemSizeRunParam") in ctk.dataframe.columns
+    # 2. with metadata_key
+    ctk2 = Thicket.concat_thickets(
+        thickets=list(gb.values()),
+        axis="columns",
+        headers=list(gb.keys()),
+        metadata_key="ProblemSizeRunParam",
+    )
+    ctk2.metadata_columns_to_perfdata(
+        metadata_keys=[(("Base_CUDA", "block_128"), "user")],
+        join_key="ProblemSizeRunParam",
+    )
+    assert (("Base_CUDA", "block_128"), "user") in ctk2.dataframe.columns
 
 
 def test_perfdata_column_to_statsframe(literal_thickets, mpi_scaling_cali):
diff --git a/thicket/tests/test_tree.py b/thicket/tests/test_tree.py
index f10cf167..298a28b2 100644
--- a/thicket/tests/test_tree.py
+++ b/thicket/tests/test_tree.py
@@ -14,8 +14,7 @@ def test_indices(rajaperf_unique_tunings):
     # No error
     tk.tree(metric_column="Avg time/rank", indices=tk.profile[0])
 
-    tk.metadata_column_to_perfdata("variant")
-    tk.metadata_column_to_perfdata("tuning")
+    tk.metadata_columns_to_perfdata(["variant", "tuning"])
 
     # Error because there are duplicate variants. We need to add the tuning to the index as well.
     tk.dataframe = (
diff --git a/thicket/thicket.py b/thicket/thicket.py
index 32f2c2d2..ce53cab8 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -617,35 +617,50 @@ def _rep_agg_func(col):
             rsuffix="_right",
         )
 
-    def metadata_column_to_perfdata(self, metadata_key, overwrite=False, drop=False):
-        """Add a column from the metadata table to the performance data table.
+    def metadata_columns_to_perfdata(
+        self, metadata_keys, overwrite=False, drop=False, join_key="profile"
+    ):
+        """Add columns from the metadata table to the performance data table. Joins on join_key, an index or column that is present in both tables.
 
         Arguments:
-            metadata_key (str): Name of the column from the metadata table
+            metadata_keys (list or str): List of the columns from the metadata table
             overwrite (bool): Determines overriding behavior in performance data table
-            drop (bool): Whether to drop the column from the metadata table afterwards
+            drop (bool): Whether to drop the columns from the metadata table afterwards
+            join_key (str): Name of the index/column to join on if not 'profile'
         """
+        # Raise error if join_key is not present in both tables
+        if not (
+            join_key in self.dataframe.reset_index()
+            and join_key in self.metadata.reset_index()
+        ):
+            raise KeyError(
+                f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table."
+            )
+
+        # Convert metadata_keys to list if str
+        if isinstance(metadata_keys, str):
+            metadata_keys = [metadata_keys]
+
         # Add warning if column already exists in performance data table
-        if metadata_key in self.dataframe.columns:
-            # Drop column to overwrite, otherwise warn and return
-            if overwrite:
-                self.dataframe.drop(metadata_key, axis=1, inplace=True)
-            else:
-                warnings.warn(
-                    "Column "
-                    + metadata_key
-                    + " already exists. Set 'overwrite=True' to force update the column."
-                )
-                return
+        for mkey in metadata_keys:
+            if mkey in self.dataframe.columns:
+                # Drop column to overwrite, otherwise warn and return
+                if overwrite:
+                    self.dataframe.drop(mkey, axis=1, inplace=True)
+                else:
+                    warnings.warn(
+                        "Column "
+                        + str(mkey)  # column-axis keys are tuples, not str
+                        + " already exists. Set 'overwrite=True' to force update the column."
+                    )
+                    return
 
         # Add the column to the performance data table
-        self.dataframe = self.dataframe.join(
-            self.metadata[metadata_key], on=self.dataframe.index.names[1]
-        )
+        self.dataframe = self.dataframe.join(self.metadata[metadata_keys], on=join_key)
 
         # Drop column
         if drop:
-            self.metadata.drop(metadata_key, axis=1, inplace=True)
+            self.metadata.drop(metadata_keys, axis=1, inplace=True)
 
     def squash(self, update_inc_cols=True, new_statsframe=True):
         """Rewrite the Graph to include only nodes present in the performance

From 67d13b56ff2e1276243d0a2c12a72e2072fa2485 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Tue, 22 Oct 2024 16:06:07 -0500
Subject: [PATCH 2/3] Rename metadata_keys argument to metadata_columns for clarity

---
 thicket/tests/test_thicket.py |  4 ++--
 thicket/thicket.py            | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/thicket/tests/test_thicket.py b/thicket/tests/test_thicket.py
index c217955c..52a46736 100644
--- a/thicket/tests/test_thicket.py
+++ b/thicket/tests/test_thicket.py
@@ -122,7 +122,7 @@ def test_metadata_columns_to_perfdata(
         headers=list(gb.keys()),
     )
     ctk.metadata_columns_to_perfdata(
-        metadata_keys=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")]
+        metadata_columns=[(("Base_CUDA", "block_128"), "ProblemSizeRunParam")]
     )
     assert (("Base_CUDA", "block_128"), "ProblemSizeRunParam") in ctk.dataframe.columns
     # 2. with metadata_key
@@ -133,7 +133,7 @@ def test_metadata_columns_to_perfdata(
         metadata_key="ProblemSizeRunParam",
     )
     ctk2.metadata_columns_to_perfdata(
-        metadata_keys=[(("Base_CUDA", "block_128"), "user")],
+        metadata_columns=[(("Base_CUDA", "block_128"), "user")],
         join_key="ProblemSizeRunParam",
     )
     assert (("Base_CUDA", "block_128"), "user") in ctk2.dataframe.columns
diff --git a/thicket/thicket.py b/thicket/thicket.py
index ce53cab8..22ab4806 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -618,12 +618,12 @@ def _rep_agg_func(col):
         )
 
     def metadata_columns_to_perfdata(
-        self, metadata_keys, overwrite=False, drop=False, join_key="profile"
+        self, metadata_columns, overwrite=False, drop=False, join_key="profile"
     ):
         """Add columns from the metadata table to the performance data table. Joins on join_key, an index or column that is present in both tables.
 
         Arguments:
-            metadata_keys (list or str): List of the columns from the metadata table
+            metadata_columns (list or str): List of the columns from the metadata table
             overwrite (bool): Determines overriding behavior in performance data table
             drop (bool): Whether to drop the columns from the metadata table afterwards
             join_key (str): Name of the index/column to join on if not 'profile'
@@ -637,12 +637,12 @@ def metadata_columns_to_perfdata(
                 f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table."
             )
 
-        # Convert metadata_keys to list if str
-        if isinstance(metadata_keys, str):
-            metadata_keys = [metadata_keys]
+        # Convert metadata_columns to list if str
+        if isinstance(metadata_columns, str):
+            metadata_columns = [metadata_columns]
 
         # Add warning if column already exists in performance data table
-        for mkey in metadata_keys:
+        for mkey in metadata_columns:
             if mkey in self.dataframe.columns:
                 # Drop column to overwrite, otherwise warn and return
                 if overwrite:
@@ -656,11 +656,11 @@ def metadata_columns_to_perfdata(
                     return
 
         # Add the column to the performance data table
-        self.dataframe = self.dataframe.join(self.metadata[metadata_keys], on=join_key)
+        self.dataframe = self.dataframe.join(self.metadata[metadata_columns], on=join_key)
 
         # Drop column
         if drop:
-            self.metadata.drop(metadata_keys, axis=1, inplace=True)
+            self.metadata.drop(metadata_columns, axis=1, inplace=True)
 
     def squash(self, update_inc_cols=True, new_statsframe=True):
         """Rewrite the Graph to include only nodes present in the performance

From 5cb40ae9e44bb75884587ae1246c84df81a39f27 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Tue, 22 Oct 2024 16:08:40 -0500
Subject: [PATCH 3/3] Apply black formatting

---
 thicket/thicket.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/thicket/thicket.py b/thicket/thicket.py
index 22ab4806..086060a3 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -656,7 +656,9 @@ def metadata_columns_to_perfdata(
                     return
 
         # Add the column to the performance data table
-        self.dataframe = self.dataframe.join(self.metadata[metadata_columns], on=join_key)
+        self.dataframe = self.dataframe.join(
+            self.metadata[metadata_columns], on=join_key
+        )
 
         # Drop column
         if drop: