Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend metadata_column_to_perfdata to Multiple Columns #216

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions thicket/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def _handle_perfdata():
new_profiles = [i for i in range(len(thickets_cp[0].profile))]
for i in range(len(thickets_cp)):
thickets_cp[i].metadata["new_profiles"] = new_profiles
thickets_cp[i].metadata_column_to_perfdata(
thickets_cp[i].metadata_columns_to_perfdata(
"new_profiles", drop=True
)
thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
Expand All @@ -233,7 +233,7 @@ def _handle_perfdata():
else: # Change second-level index to be from metadata's "metadata_key" column
for i in range(len(thickets_cp)):
if metadata_key not in thickets_cp[i].dataframe.index.names:
thickets_cp[i].metadata_column_to_perfdata(metadata_key)
thickets_cp[i].metadata_columns_to_perfdata(metadata_key)
thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
new_mappings.update(
pd.Series(
Expand Down
2 changes: 1 addition & 1 deletion thicket/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _agg_rows(col_series):
if col not in index_names:
if col in tk_c.metadata.columns or col in df_columns:
if col not in df_columns:
tk_c.metadata_column_to_perfdata(col)
tk_c.metadata_columns_to_perfdata(col)
tk_c.dataframe = tk_c.dataframe.set_index(col, append=True)
else:
raise KeyError(f'"{col}" is not in the PerfData or MetaData.')
Expand Down
79 changes: 57 additions & 22 deletions thicket/tests/test_thicket.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,28 +80,63 @@ def _test_multiindex():
assert bool(re.search("1.000.*Basic_COPY8", tree_output))


def test_metadata_column_to_perfdata(mpi_scaling_cali):
    """Check that a metadata column can be copied into the performance data table."""
    column = "jobsize"
    expected_jobsizes = (27, 64, 125, 216, 343)

    ensemble = Thicket.from_caliperreader(mpi_scaling_cali, disable_tqdm=True)

    # Preconditions: the column lives only in the metadata table, and the
    # second index level of the performance data table is "profile".
    assert column in ensemble.metadata
    assert column not in ensemble.dataframe
    assert ensemble.dataframe.index.names[1] == "profile"

    ensemble.metadata_column_to_perfdata(column)

    # The column is now part of the performance data table ...
    assert column in ensemble.dataframe

    # ... and every expected value shows up in it.
    observed = ensemble.dataframe[column].values.astype("int")
    for jobsize in expected_jobsizes:
        assert jobsize in observed
def test_metadata_columns_to_perfdata(
    rajaperf_cuda_block128_1M_cali, rajaperf_seq_O3_1M_cali
):
    """Exercise metadata_columns_to_perfdata.

    Covers adding several columns at once, the overwrite warning, dropping
    from the metadata table, the join_key error, an alternate join key, and
    column-axis Thickets (with and without metadata_key).
    """
    cali_files = [rajaperf_cuda_block128_1M_cali[0], rajaperf_seq_O3_1M_cali[0]]
    tk = Thicket.from_caliperreader(cali_files, disable_tqdm=True)
    # Untouched copy, used later to build the column-axis Thickets
    pristine = tk.deepcopy()

    # Several columns are added in a single call
    tk.metadata_columns_to_perfdata(["variant", "tuning"])
    assert "variant" in tk.dataframe.columns
    assert "tuning" in tk.dataframe.columns

    # Adding them a second time without overwrite must warn
    with pytest.warns(UserWarning, match=r"Column .* already exists"):
        tk.metadata_columns_to_perfdata(["variant", "tuning"])

    # drop=True removes the column from the metadata table
    dropped = tk.deepcopy()
    dropped.metadata_columns_to_perfdata("variant", overwrite=True, drop=True)
    assert "variant" not in dropped.metadata

    # Without "profile" in the performance data table the join key is rejected
    dropped.dataframe = dropped.dataframe.reset_index(level="profile", drop=True)
    with pytest.raises(KeyError, match="'profile' must be present"):
        dropped.metadata_columns_to_perfdata("tuning", overwrite=True)

    # A column other than "profile" can serve as the join key
    tk.metadata_columns_to_perfdata("ProblemSizeRunParam")
    tk.metadata_columns_to_perfdata("user", join_key="ProblemSizeRunParam")
    assert "user" in tk.dataframe

    # Column-axis Thickets are built from the groups of the untouched copy
    groups = pristine.groupby(["variant", "tuning"])

    def _concat_columns(**extra):
        # Fresh lists on every call, as in two independent concat calls
        return Thicket.concat_thickets(
            thickets=list(groups.values()),
            axis="columns",
            headers=list(groups.keys()),
            **extra,
        )

    # 1. without metadata_key
    size_label = (("Base_CUDA", "block_128"), "ProblemSizeRunParam")
    column_tk = _concat_columns()
    column_tk.metadata_columns_to_perfdata(metadata_columns=[size_label])
    assert size_label in column_tk.dataframe.columns

    # 2. with metadata_key
    user_label = (("Base_CUDA", "block_128"), "user")
    keyed_tk = _concat_columns(metadata_key="ProblemSizeRunParam")
    keyed_tk.metadata_columns_to_perfdata(
        metadata_columns=[user_label],
        join_key="ProblemSizeRunParam",
    )
    assert user_label in keyed_tk.dataframe.columns


def test_perfdata_column_to_statsframe(literal_thickets, mpi_scaling_cali):
Expand Down
3 changes: 1 addition & 2 deletions thicket/tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ def test_indices(rajaperf_unique_tunings):
# No error
tk.tree(metric_column="Avg time/rank", indices=tk.profile[0])

tk.metadata_column_to_perfdata("variant")
tk.metadata_column_to_perfdata("tuning")
tk.metadata_columns_to_perfdata(["variant", "tuning"])

# Error because there are duplicate variants. We need to add the tuning to the index as well.
tk.dataframe = (
Expand Down
51 changes: 34 additions & 17 deletions thicket/thicket.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,35 +617,52 @@ def _rep_agg_func(col):
rsuffix="_right",
)

def metadata_columns_to_perfdata(
    self, metadata_columns, overwrite=False, drop=False, join_key="profile"
):
    """Add columns from the metadata table to the performance data table.

    The selected metadata columns are joined onto the performance data table
    on join_key, an index level or column that is present in both tables.

    Arguments:
        metadata_columns (list or str): Column, or list of columns, from the metadata table
        overwrite (bool): Determines overriding behavior in performance data table.
            If False and any requested column already exists there, a warning is
            issued and nothing is added.
        drop (bool): Whether to drop the columns from the metadata table afterwards
        join_key (str): Name of the index/column to join on if not 'profile'

    Raises:
        KeyError: If join_key is not an index level or column of both tables.
    """

    def _has_join_key(table):
        # Membership test on the labels only; avoids copying the whole table
        # with reset_index() just to look up one name.
        return join_key in table.index.names or join_key in table.columns

    # Raise error if join_key is not present in both tables
    if not (_has_join_key(self.dataframe) and _has_join_key(self.metadata)):
        raise KeyError(
            f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table."
        )

    # Convert metadata_columns to list if str
    if isinstance(metadata_columns, str):
        metadata_columns = [metadata_columns]

    # Add warning if column already exists in performance data table
    for mkey in metadata_columns:
        if mkey in self.dataframe.columns:
            # Drop column to overwrite, otherwise warn and return
            if overwrite:
                self.dataframe.drop(mkey, axis=1, inplace=True)
            else:
                # f-string rather than "+" so that non-string labels (e.g. the
                # tuple labels of a column-axis Thicket) do not raise TypeError.
                warnings.warn(
                    f"Column {mkey} already exists. Set 'overwrite=True' to force update the column."
                )
                return

    # Add the columns to the performance data table
    # NOTE(review): DataFrame.join matches `on` against the *index* of the
    # metadata table. When join_key is only a column of the metadata table the
    # values are compared with its index instead -- confirm this is intended.
    self.dataframe = self.dataframe.join(
        self.metadata[metadata_columns], on=join_key
    )

    # Drop columns
    if drop:
        self.metadata.drop(metadata_columns, axis=1, inplace=True)


def metadata_column_to_perfdata(self, metadata_key, overwrite=False, drop=False):
    """Add a single column from the metadata table to the performance data table.

    Single-column form retained so existing callers keep working; it delegates
    to metadata_columns_to_perfdata and joins on the second-level index of the
    performance data table.

    Arguments:
        metadata_key (str): Name of the column from the metadata table
        overwrite (bool): Determines overriding behavior in performance data table
        drop (bool): Whether to drop the column from the metadata table afterwards
    """
    self.metadata_columns_to_perfdata(
        [metadata_key],
        overwrite=overwrite,
        drop=drop,
        join_key=self.dataframe.index.names[1],
    )

def squash(self, update_inc_cols=True, new_statsframe=True):
"""Rewrite the Graph to include only nodes present in the performance
Expand Down
Loading