From b3792012669f8b01aa707d542ac0a0a504a09343 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 13:03:46 -0700 Subject: [PATCH 01/19] Add error check --- thicket/ncu.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/thicket/ncu.py b/thicket/ncu.py index 0e8440c5..573899bb 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -82,6 +82,9 @@ def _read_ncu(thicket, ncu_report_mapping): # Loop through NCU files for ncu_report_file in ncu_report_mapping: + # Set error check flag + call_trace_found = False + # NCU hash profile_mapping_flipped = {v: k for k, v in thicket.profile_mapping.items()} ncu_hash = profile_mapping_flipped[ncu_report_mapping[ncu_report_file]] @@ -124,6 +127,7 @@ def _read_ncu(thicket, ncu_report_mapping): if len(kernel_call_trace) == 0: continue else: + call_trace_found=True # Add kernel name to the end of the trace tuple kernel_call_trace.append(kernel_name) @@ -155,4 +159,7 @@ def _read_ncu(thicket, ncu_report_mapping): dict(zip(metric_names, metric_values)) ) + if not call_trace_found: + raise ValueError(f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU.") + return data_dict, rollup_dict From f3e76d592ec0736f1e0b70e62f3f24b290a8eab9 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 13:06:09 -0700 Subject: [PATCH 02/19] Add error check --- thicket/ncu.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 573899bb..1fbef15b 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -93,12 +93,19 @@ def _read_ncu(thicket, ncu_report_mapping): report = ncu_report.load_report(ncu_report_file) # Error check - if report.num_ranges() > 1: + num_ranges = report.num_ranges() + if num_ranges > 1: raise ValueError( "NCU report file " + ncu_report_file + " has multiple ranges. Not supported yet." ) + elif num_ranges == 0: + raise ValueError( + "NCU report file " + + ncu_report_file + + " has no ranges (no data)." + ) # Loop through ranges in report for range in report: # Grab first action From 2c2016dff650a3a62baa27cf66d2376eb7b1eace Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 15:51:27 -0700 Subject: [PATCH 03/19] Enable matching RAJA_CUDA data using demangled string --- thicket/ncu.py | 80 +++++++++++++++++++++++++++++++++++++--------- thicket/thicket.py | 4 +-- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 1fbef15b..8c382dc2 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT from collections import defaultdict +import re from hatchet import QueryMatcher import pandas as pd @@ -63,7 +64,7 @@ def _predicate_builder(kernel, is_regex=False): return query @staticmethod - def _read_ncu(thicket, ncu_report_mapping): + def _read_ncu(thicket, ncu_report_mapping, debug=False): """Read NCU report files and return dictionary of data. Arguments: @@ -89,6 +90,10 @@ def _read_ncu(thicket, ncu_report_mapping): profile_mapping_flipped = {v: k for k, v in thicket.profile_mapping.items()} ncu_hash = profile_mapping_flipped[ncu_report_mapping[ncu_report_file]] + # Relevant for kernel matching + variant = thicket.metadata.loc[ncu_hash, "variant"] + raja_cuda = variant.upper() == "RAJA_CUDA" + # Load file report = ncu_report.load_report(ncu_report_file) @@ -102,9 +107,7 @@ def _read_ncu(thicket, ncu_report_mapping): ) elif num_ranges == 0: raise ValueError( - "NCU report file " - + ncu_report_file - + " has no ranges (no data)." + "NCU report file " + ncu_report_file + " has no ranges (no data)." ) # Loop through ranges in report for range in report: @@ -123,8 +126,10 @@ def _read_ncu(thicket, ncu_report_mapping): pbar = tqdm(range) for i, action in enumerate(pbar): pbar.set_description(f"Processing action {i}/{len(range)}") - # Name of kernel - kernel_name = action.name() + # Demangled name of kernel + demangled_kernel_name = action.name( + ncu_report.IAction.NameBase_DEMANGLED + ) # Get NCU-side kernel trace kernel_call_trace = list( action.nvtx_state().domain_by_id(0).push_pop_ranges() @@ -134,9 +139,32 @@ def _read_ncu(thicket, ncu_report_mapping): if len(kernel_call_trace) == 0: continue else: - call_trace_found=True + call_trace_found = True + + # Call trace with last element removed + # (last elem usually not useful for matching) + temp_call_trace = kernel_call_trace[:-1] + call_trace_str = "::".join([s.lower() for s in temp_call_trace]) + + # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA + kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" + kernel_match = re.search(kernel_pattern, demangled_kernel_name) + kernel_str = kernel_match.group(1) + + if raja_cuda: + # RAJA_CUDA variant + instance_pattern = r"instance (\d+)" + instance_match = re.findall( + instance_pattern, demangled_kernel_name + ) + instance_num = instance_match[-1] + kernel_name = kernel_str + "_" + instance_num + else: + # Base_CUDA variant + kernel_name = kernel_str + # Add kernel name to the end of the trace tuple - kernel_call_trace.append(kernel_name) + kernel_call_trace.append(kernel_str) # Match ncu kernel to thicket node matched_node = None @@ -150,23 +178,45 @@ def _read_ncu(thicket, ncu_report_mapping): ) # Apply the query node_set = query.apply(thicket) - # Find the correct node - matched_node = [ - n for n in node_set if kernel_name in n.frame["name"] - ][0] + # Find the correct node. This may also get the parent so we take the last one + matched_nodes = [ + n + for n in node_set + if kernel_str in n.frame["name"] + and ( + f"#{instance_num}" in n.frame["name"] + if raja_cuda + else True + ) + ] + matched_node = matched_nodes[0] + + if debug: + if not raja_cuda: + instance_num = "NA" + print( + f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}" + ) + print( + f"AKA:\n\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n" + ) + print("All matched nodes:") + for node in matched_nodes: + print("\t", node) - # matched_node should always exist at this point - assert matched_node is not None # Set mapping kernel_map[kernel_name] = matched_node metric_values = [action[name].value() for name in metric_names] + assert len(metric_names) == len(metric_values) data_dict[(matched_node, ncu_hash)].append( dict(zip(metric_names, metric_values)) ) if not call_trace_found: - raise ValueError(f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU.") + raise ValueError( + f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU." + ) return data_dict, rollup_dict diff --git a/thicket/thicket.py b/thicket/thicket.py index 32f2c2d2..7f386fe6 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -554,7 +554,7 @@ def from_json(json_thicket): # make and return thicket? return th - def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False): + def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False): """Add NCU data into the PerformanceDataFrame Arguments: @@ -589,7 +589,7 @@ def _rep_agg_func(col): ncureader = NCUReader() # Dictionary of NCU data - data_dict, rollup_dict = ncureader._read_ncu(self, ncu_report_mapping) + data_dict, rollup_dict = ncureader._read_ncu(self, ncu_report_mapping, debug) # Create empty df ncu_df = pd.DataFrame() From 8f76091a8362cf49547d4dfa5563d4907168c961 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 16:04:05 -0700 Subject: [PATCH 04/19] black and docstring --- thicket/ncu.py | 1 + thicket/thicket.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 8c382dc2..12e4954d 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -70,6 +70,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): Arguments: thicket (Thicket): thicket object to add ncu metrics to ncu_report_mapping (dict): mapping from NCU report file to profile + debug (bool): whether to print debug statements Returns: data_dict (dict): dictionary of NCU data where key is tuple, (node, profile), mapping to list of dictionaries for per-rep data that is aggregated down to one dictionary. diff --git a/thicket/thicket.py b/thicket/thicket.py index 7f386fe6..4534b62a 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -554,13 +554,16 @@ def from_json(json_thicket): # make and return thicket? return th - def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False): + def add_ncu( + self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False + ): """Add NCU data into the PerformanceDataFrame Arguments: ncu_report_mapping (dict): mapping from NCU report file to profile chosen_metrics (list): list of metrics to sub-select from NCU report overwrite (bool): whether to overwrite existing columns in the Thicket.DataFrame + debug (bool): whether to print debug information """ def _rep_agg_func(col): From b4cb38cc6a85dac2db8813c2f93f954c08d518fb Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 16:06:41 -0700 Subject: [PATCH 05/19] black --- thicket/thicket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thicket/thicket.py b/thicket/thicket.py index 4534b62a..c616f0ad 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -555,8 +555,8 @@ def from_json(json_thicket): return th def add_ncu( - self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False - ): + self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False + ): """Add NCU data into the PerformanceDataFrame Arguments: From 14145bb198552043e86a1063669b1dad5519b866 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 16:22:58 -0700 Subject: [PATCH 06/19] Add Lambda_CUDA support --- thicket/ncu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 12e4954d..6293a398 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -93,7 +93,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Relevant for kernel matching variant = thicket.metadata.loc[ncu_hash, "variant"] - raja_cuda = variant.upper() == "RAJA_CUDA" + raja_lambda_cuda = variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA" # Load file report = ncu_report.load_report(ncu_report_file) @@ -152,7 +152,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): kernel_match = re.search(kernel_pattern, demangled_kernel_name) kernel_str = kernel_match.group(1) - if raja_cuda: + if raja_lambda_cuda: # RAJA_CUDA variant instance_pattern = r"instance (\d+)" instance_match = re.findall( @@ -186,14 +186,14 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): if kernel_str in n.frame["name"] and ( f"#{instance_num}" in n.frame["name"] - if raja_cuda + if raja_lambda_cuda else True ) ] matched_node = matched_nodes[0] if debug: - if not raja_cuda: + if not raja_lambda_cuda: instance_num = "NA" print( f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}" From 5ac8720146cdd160b38e59b2d4b1452a0f874ef9 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 29 Jul 2024 20:04:05 -0700 Subject: [PATCH 07/19] black --- thicket/ncu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 6293a398..ed23ae36 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -93,7 +93,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Relevant for kernel matching variant = thicket.metadata.loc[ncu_hash, "variant"] - raja_lambda_cuda = variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA" + raja_lambda_cuda = ( + variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA" + ) # Load file report = ncu_report.load_report(ncu_report_file) From aa3146fe6588a74fa20c03079c012bd19ef09bb6 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Tue, 30 Jul 2024 12:27:08 -0700 Subject: [PATCH 08/19] Update docstring --- thicket/thicket.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thicket/thicket.py b/thicket/thicket.py index c616f0ad..8f9f6c80 100644 --- a/thicket/thicket.py +++ b/thicket/thicket.py @@ -560,7 +560,7 @@ def add_ncu( """Add NCU data into the PerformanceDataFrame Arguments: - ncu_report_mapping (dict): mapping from NCU report file to profile + ncu_report_mapping (dict): mapping from NCU report file to Caliper CUDA Activity Profile chosen_metrics (list): list of metrics to sub-select from NCU report overwrite (bool): whether to overwrite existing columns in the Thicket.DataFrame debug (bool): whether to print debug information From caa6a3cabfcda815daa8d8e186825196480d9c66 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Thu, 3 Oct 2024 15:13:24 -0700 Subject: [PATCH 09/19] Skip matching if different pattern. Add additional debug statements --- thicket/ncu.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index ed23ae36..1aa45247 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -129,6 +129,8 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): pbar = tqdm(range) for i, action in enumerate(pbar): pbar.set_description(f"Processing action {i}/{len(range)}") + if debug: + print(f"Action: {i}") # Demangled name of kernel demangled_kernel_name = action.name( ncu_report.IAction.NameBase_DEMANGLED @@ -148,14 +150,22 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # (last elem usually not useful for matching) temp_call_trace = kernel_call_trace[:-1] call_trace_str = "::".join([s.lower() for s in temp_call_trace]) + if debug: + print(f"\tKernel Call Trace: {call_trace_str}") # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" kernel_match = re.search(kernel_pattern, demangled_kernel_name) - kernel_str = kernel_match.group(1) + # Found match + if kernel_match: + kernel_str = kernel_match.group(1) + else: + if debug: + print(f"\tCould not match {demangled_kernel_name}") + continue if raja_lambda_cuda: - # RAJA_CUDA variant + # RAJA_CUDA/Lambda_CUDA variant instance_pattern = r"instance (\d+)" instance_match = re.findall( instance_pattern, demangled_kernel_name @@ -198,12 +208,12 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): if not raja_lambda_cuda: instance_num = "NA" print( - f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}" + f"\tMatched NCU kernel:\n\t\t{demangled_kernel_name}\n\tto Caliper Node:\n\t\t{matched_node}" ) print( - f"AKA:\n\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n" + f"\tAKA:\n\t\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n" ) - print("All matched nodes:") + print("\tAll matched nodes:") for node in matched_nodes: print("\t", node) From dbe3e5806fa7c7ecdf9914977f6f55a62dcd17ce Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Thu, 3 Oct 2024 16:46:49 -0700 Subject: [PATCH 10/19] Partial cub support --- thicket/ncu.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 1aa45247..559b70ad 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -149,9 +149,16 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Call trace with last element removed # (last elem usually not useful for matching) temp_call_trace = kernel_call_trace[:-1] - call_trace_str = "::".join([s.lower() for s in temp_call_trace]) + # Special case to match "cub" kernels + if "cub" in demangled_kernel_name: + call_trace_str = "cub" + else: + call_trace_str = "::".join( + [s.lower() for s in temp_call_trace] + ) if debug: print(f"\tKernel Call Trace: {call_trace_str}") + print(action.name()) # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" @@ -164,17 +171,19 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): print(f"\tCould not match {demangled_kernel_name}") continue - if raja_lambda_cuda: - # RAJA_CUDA/Lambda_CUDA variant - instance_pattern = r"instance (\d+)" - instance_match = re.findall( - instance_pattern, demangled_kernel_name - ) + # RAJA_CUDA/Lambda_CUDA variant + instance_pattern = r"instance (\d+)" + instance_match = re.findall( + instance_pattern, demangled_kernel_name + ) + if instance_match: instance_num = instance_match[-1] kernel_name = kernel_str + "_" + instance_num + instance_exists = True else: # Base_CUDA variant kernel_name = kernel_str + instance_exists = False # Add kernel name to the end of the trace tuple kernel_call_trace.append(kernel_str) @@ -198,14 +207,14 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): if kernel_str in n.frame["name"] and ( f"#{instance_num}" in n.frame["name"] - if raja_lambda_cuda + if raja_lambda_cuda and instance_exists else True ) ] matched_node = matched_nodes[0] if debug: - if not raja_lambda_cuda: + if not raja_lambda_cuda or not instance_exists: instance_num = "NA" print( f"\tMatched NCU kernel:\n\t\t{demangled_kernel_name}\n\tto Caliper Node:\n\t\t{matched_node}" From 5ea81853c4a68b9ca01b9a9984694a1966f9e8af Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Fri, 4 Oct 2024 12:17:43 -0700 Subject: [PATCH 11/19] change cache to use demangled name as key for uniqueness. Add similarity matching for cub kernels --- thicket/ncu.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index 559b70ad..ac5cbb26 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT from collections import defaultdict +from difflib import SequenceMatcher import re from hatchet import QueryMatcher @@ -152,12 +153,19 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Special case to match "cub" kernels if "cub" in demangled_kernel_name: call_trace_str = "cub" + # Replace substrings that may cause mismatch + demangled_kernel_name = demangled_kernel_name.replace( + "(bool)1", "true" + ) + demangled_kernel_name = demangled_kernel_name.replace( + "(bool)0", "false" + ) else: call_trace_str = "::".join( [s.lower() for s in temp_call_trace] ) if debug: - print(f"\tKernel Call Trace: {call_trace_str}") + print(f"\tKernel Call Trace: {kernel_call_trace}") print(action.name()) # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA @@ -190,9 +198,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Match ncu kernel to thicket node matched_node = None - if kernel_name in kernel_map: + if demangled_kernel_name in kernel_map: # Skip query building - matched_node = kernel_map[kernel_name] + matched_node = kernel_map[demangled_kernel_name] else: # kernel hasn't been seen yet # Build query query = NCUReader._build_query_from_ncu_trace( @@ -211,7 +219,25 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): else True ) ] - matched_node = matched_nodes[0] + if len(matched_nodes) > 1: + # Attempt to match using similarity + match_dict = {} + for node in matched_nodes: + match_ratio = SequenceMatcher( + None, node.frame["name"], demangled_kernel_name + ).ratio() + match_dict[node] = match_ratio + matched_node = max(match_dict, key=match_dict.get) + if debug: + print( + f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..." + ) + elif len(matched_nodes) == 1: + matched_node = matched_nodes[0] + else: + raise ValueError( + "No node found for kernel: " + kernel_str + ) if debug: if not raja_lambda_cuda or not instance_exists: @@ -227,7 +253,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): print("\t", node) # Set mapping - kernel_map[kernel_name] = matched_node + kernel_map[demangled_kernel_name] = matched_node metric_values = [action[name].value() for name in metric_names] From ff13797d08952f971dddc299ac6ca8c660cb988a Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Fri, 4 Oct 2024 12:33:37 -0700 Subject: [PATCH 12/19] Remove unused variable --- thicket/ncu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/thicket/ncu.py b/thicket/ncu.py index ac5cbb26..c3119658 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -186,11 +186,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): ) if instance_match: instance_num = instance_match[-1] - kernel_name = kernel_str + "_" + instance_num instance_exists = True else: # Base_CUDA variant - kernel_name = kernel_str instance_exists = False # Add kernel name to the end of the trace tuple From 9a9eab01cfc5b7487e1cebb09906eef02e61308f Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 10:52:19 -0700 Subject: [PATCH 13/19] Refactor matching functions to enable unit testing --- thicket/helpers.py | 109 ++++++++++++++++++++++++ thicket/ncu.py | 91 ++++++-------------- thicket/tests/test_helpers.py | 155 ++++++++++++++++++++++++++++++++++ 3 files changed, 289 insertions(+), 66 deletions(-) create mode 100644 thicket/tests/test_helpers.py diff --git a/thicket/helpers.py b/thicket/helpers.py index c6e7314c..18324874 100644 --- a/thicket/helpers.py +++ b/thicket/helpers.py @@ -3,6 +3,9 @@ # # SPDX-License-Identifier: MIT +import re +from difflib import SequenceMatcher + from more_itertools import powerset import pandas as pd @@ -168,3 +171,109 @@ def _get_perf_columns(df): def _powerset_from_tuple(tup): pset = [y for y in powerset(tup)] return {x[0] if len(x) == 1 else x for x in pset} + + +def _match_call_trace_regex(kernel_call_trace, demangled_kernel_name, debug, action=None): + """Use the NCU call trace to regex match the kernel name from the demangled + kernel string. Also modifies the demangled kernel name in certain cases. Returns + the matched kernel string, if match is possible. + + Arguments: + kernel_call_trace (list): List of strings from NCU representing the call trace + demangled_kernel_name (str): Demangled kernel name from NCU + debug (bool): Print debug statements + action (ncu_report.IAction): NCU action object + """ + # Call trace with last element removed + # (last elem usually not useful for matching) + temp_call_trace = kernel_call_trace[:-1] + # Special case to match "cub" kernels + if "cub" in demangled_kernel_name: + call_trace_str = "cub" + # Replace substrings that may cause mismatch + demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true") + demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false") + else: + call_trace_str = "::".join([s.lower() for s in temp_call_trace]) + if debug: + print(f"\tKernel Call Trace: {kernel_call_trace}") + print(f"\t{action.name()}") + + # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA + kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" + kernel_match = re.search(kernel_pattern, demangled_kernel_name) + # Found match + if kernel_match: + kernel_str = kernel_match.group(1) + else: + if debug: + print(f"\tCould not match {demangled_kernel_name}") + return None, None, None, True + + # RAJA_CUDA/Lambda_CUDA variant + instance_pattern = r"instance (\d+)" + instance_match = re.findall(instance_pattern, demangled_kernel_name) + if instance_match: + instance_num = instance_match[-1] + instance_exists = True + else: + # Base_CUDA variant + instance_num = None + instance_exists = False + + return kernel_str, demangled_kernel_name, instance_num, instance_exists, False + + +def _match_kernel_str_to_cali( + node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists +): + """Given a set of nodes, node_set, from querying the Caliper call + tree using the NCU call trace, match the kernel_str to one of the + node names. Additionally, use the instance number, instance_num to + match kernels with multiple instances, if applicable. + + Arguments: + node_set (list): List of Hatchet nodes from querying the call tree + kernel_str (str): Kernel name from _match_call_trace_regex + instance_num (int): Instance number of kernel, if applicable + raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA + instance_exists (bool): True if instance number exists, False if not + """ + return [ + n + for n in node_set + if kernel_str in n.frame["name"] + and ( + f"#{instance_num}" in n.frame["name"] + if raja_lambda_cuda and instance_exists + else True + ) + ] + + +def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug): + """If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity. + + Arguments: + matched_nodes (list): List of matched Hatchet nodes + demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex + debug (bool): Print debug statements + + Returns: + matched_node (Hatchet.node): Hatchet node with highest similarity score + """ + # Attempt to match using similarity + match_dict = {} + for node in matched_nodes: + match_ratio = SequenceMatcher( + None, node.frame["name"], demangled_kernel_name + ).ratio() + match_dict[match_ratio] = node + # Get highest ratio + highest_ratio = max(list(match_dict.keys())) + matched_node = match_dict[highest_ratio] + if debug: + print( + f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..." + ) + return matched_node diff --git a/thicket/ncu.py b/thicket/ncu.py index c3119658..8d29029a 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -4,13 +4,16 @@ # SPDX-License-Identifier: MIT from collections import defaultdict -from difflib import SequenceMatcher -import re from hatchet import QueryMatcher import pandas as pd from tqdm import tqdm +from .helpers import ( + _match_call_trace_regex, + _match_kernel_str_to_cali, + _multi_match_fallback_similarity, +) import ncu_report @@ -147,49 +150,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): else: call_trace_found = True - # Call trace with last element removed - # (last elem usually not useful for matching) - temp_call_trace = kernel_call_trace[:-1] - # Special case to match "cub" kernels - if "cub" in demangled_kernel_name: - call_trace_str = "cub" - # Replace substrings that may cause mismatch - demangled_kernel_name = demangled_kernel_name.replace( - "(bool)1", "true" - ) - demangled_kernel_name = demangled_kernel_name.replace( - "(bool)0", "false" - ) - else: - call_trace_str = "::".join( - [s.lower() for s in temp_call_trace] - ) - if debug: - print(f"\tKernel Call Trace: {kernel_call_trace}") - print(action.name()) - - # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA - kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" - kernel_match = re.search(kernel_pattern, demangled_kernel_name) - # Found match - if kernel_match: - kernel_str = kernel_match.group(1) - else: - if debug: - print(f"\tCould not match {demangled_kernel_name}") - continue - - # RAJA_CUDA/Lambda_CUDA variant - instance_pattern = r"instance (\d+)" - instance_match = re.findall( - instance_pattern, demangled_kernel_name + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + kernel_call_trace, demangled_kernel_name, debug, action ) - if instance_match: - instance_num = instance_match[-1] - instance_exists = True - else: - # Base_CUDA variant - instance_exists = False + if skip_kernel: + continue # Add kernel name to the end of the trace tuple kernel_call_trace.append(kernel_str) @@ -207,29 +178,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): # Apply the query node_set = query.apply(thicket) # Find the correct node. This may also get the parent so we take the last one - matched_nodes = [ - n - for n in node_set - if kernel_str in n.frame["name"] - and ( - f"#{instance_num}" in n.frame["name"] - if raja_lambda_cuda and instance_exists - else True - ) - ] + matched_nodes = _match_kernel_str_to_cali( + node_set, + kernel_str, + instance_num, + raja_lambda_cuda, + instance_exists, + ) if len(matched_nodes) > 1: - # Attempt to match using similarity - match_dict = {} - for node in matched_nodes: - match_ratio = SequenceMatcher( - None, node.frame["name"], demangled_kernel_name - ).ratio() - match_dict[node] = match_ratio - matched_node = max(match_dict, key=match_dict.get) - if debug: - print( - f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..." - ) + matched_node = _multi_match_fallback_similarity( + matched_nodes, demangled_kernel_name, debug + ) elif len(matched_nodes) == 1: matched_node = matched_nodes[0] else: diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py new file mode 100644 index 00000000..04c0b55d --- /dev/null +++ b/thicket/tests/test_helpers.py @@ -0,0 +1,155 @@ +from thicket.helpers import ( + _match_call_trace_regex, + _match_kernel_str_to_cali, + _multi_match_fallback_similarity, +) +from hatchet.node import Node + + +def test_match_call_trace_regex(): + + # Base_CUDA variant + kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( + _match_call_trace_regex( + ["RAJAPerf", "Basic", "Basic_DAXPY"], + "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)", + debug=False, + ) + ) + assert kernel_str == "daxpy" + + # lambda_CUDA variant + kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( + _match_call_trace_regex( + ["RAJAPerf", "Polybench", "Polybench_ATAX"], + "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)", + debug=False, + ) + ) + assert kernel_str == "poly_atax_lam" + + # RAJA_CUDA variant + kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( + _match_call_trace_regex( + ["RAJAPerf", "Apps", "Apps_ENERGY"], + "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", + debug=False, + ) + ) + assert kernel_str == "ENERGY" + + +def test_match_kernel_str_to_cali(): + # RAJA_CUDA variant + kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( + _match_call_trace_regex( + ["RAJAPerf", "Apps", "Apps_ENERGY"], + "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", + debug=False, + ) + ) + # Test multi-instance (for energy4) + node_set = [ + Node({"name": "RAJAPerf", "type": "function"}), + Node({"name": "Apps", "type": "function"}), + Node({"name": "Apps_ENERGY", "type": "function"}), + Node({"name": "cudaLaunchKernel", "type": "function"}), + # energy1 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#1}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#1}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + # energy2 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#2}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#2}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + # energy3 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#3}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#3}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + # energy4 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + # energy5 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#5}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#5}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + # energy6 + Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#6}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#6}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ), + ] + matched_nodes = _match_kernel_str_to_cali( + node_set, kernel_str, instance_num, True, instance_exists + ) + assert len(matched_nodes) == 1 + # energy4 + assert ( + matched_nodes[0].frame["name"] + == Node( + { + "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, RAJA::Iterators::numeric_iterator, long)", + "type": "kernel", + } + ).frame["name"] + ) + + +def test_multi_match_fallback_similarity(): + # CUB kernels + demangled_kernel_name = "void cub::DeviceRadixSortUpsweepKernel::Policy700, (bool)1, (bool)0, double, int>(const T4 *, T5 *, T5, int, int, cub::GridEvenShare)" + kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( + _match_call_trace_regex( + ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"], + demangled_kernel_name=demangled_kernel_name, + debug=False, + ) + ) + node_set = [ + Node({"name": "RAJAPerf", "type": "function"}), + Node({"name": "Algorithm", "type": "function"}), + Node({"name": "Algorithm_SORT", "type": "function"}), + Node({"name": "cudaLaunchKernel", "type": "function"}), + # "false, false" wrong match + Node( + { + "name": "void cub::DeviceRadixSortUpsweepKernel::Policy700, false, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare)", + "type": "kernel", + } + ), + # "true, false" correct match + Node( + { + "name": "void cub::DeviceRadixSortUpsweepKernel::Policy700, true, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare)", + "type": "kernel", + } + ), + ] + matched_nodes = _match_kernel_str_to_cali( + node_set, kernel_str, instance_num, True, instance_exists + ) + matched_node = _multi_match_fallback_similarity( + matched_nodes, demangled_kernel_name, debug=False + ) + assert ( + matched_node.frame["name"] + == "void cub::DeviceRadixSortUpsweepKernel::Policy700, true, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare)" + ) From 9c65ac70087b298820cdac2245311fcae0734c92 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 10:58:33 -0700 Subject: [PATCH 14/19] black --- thicket/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thicket/helpers.py b/thicket/helpers.py index 18324874..7ea6ca1b 100644 --- a/thicket/helpers.py +++ b/thicket/helpers.py @@ -173,7 +173,9 @@ def _powerset_from_tuple(tup): return {x[0] if len(x) == 1 else x for x in pset} -def _match_call_trace_regex(kernel_call_trace, demangled_kernel_name, debug, action=None): +def _match_call_trace_regex( + kernel_call_trace, demangled_kernel_name, debug, action=None +): """Use the NCU call trace to regex match the kernel name from the demangled kernel string. Also modifies the demangled kernel name in certain cases. Returns the matched kernel string, if match is possible. From 87e2c0bbde9c6a2563e96a346e49fab06fb1e9fc Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 10:59:30 -0700 Subject: [PATCH 15/19] black --- thicket/tests/test_helpers.py | 80 ++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py index 04c0b55d..7aa1d5d9 100644 --- a/thicket/tests/test_helpers.py +++ b/thicket/tests/test_helpers.py @@ -9,44 +9,60 @@ def test_match_call_trace_regex(): # Base_CUDA variant - kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( - _match_call_trace_regex( - ["RAJAPerf", "Basic", "Basic_DAXPY"], - "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)", - debug=False, - ) + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + ["RAJAPerf", "Basic", "Basic_DAXPY"], + "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)", + debug=False, ) assert kernel_str == "daxpy" # lambda_CUDA variant - kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( - _match_call_trace_regex( - ["RAJAPerf", "Polybench", "Polybench_ATAX"], - "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)", - debug=False, - ) + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + ["RAJAPerf", "Polybench", "Polybench_ATAX"], + "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)", + debug=False, ) assert kernel_str == "poly_atax_lam" # RAJA_CUDA variant - kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( - _match_call_trace_regex( - ["RAJAPerf", "Apps", "Apps_ENERGY"], - "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", - debug=False, - ) + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + ["RAJAPerf", "Apps", "Apps_ENERGY"], + "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", + debug=False, ) assert kernel_str == "ENERGY" def test_match_kernel_str_to_cali(): # RAJA_CUDA variant - kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( - _match_call_trace_regex( - ["RAJAPerf", "Apps", "Apps_ENERGY"], - "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", - debug=False, - ) + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + ["RAJAPerf", "Apps", "Apps_ENERGY"], + "void RAJA::policy::cuda::impl::forall_cuda_kernel, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)", + debug=False, ) # Test multi-instance (for energy4) node_set = [ @@ -116,12 +132,16 @@ def test_match_kernel_str_to_cali(): def test_multi_match_fallback_similarity(): # CUB kernels demangled_kernel_name = "void cub::DeviceRadixSortUpsweepKernel::Policy700, (bool)1, (bool)0, double, int>(const T4 *, T5 *, T5, int, int, cub::GridEvenShare)" - kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = ( - _match_call_trace_regex( - ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"], - demangled_kernel_name=demangled_kernel_name, - debug=False, - ) + ( + kernel_str, + demangled_kernel_name, + instance_num, + instance_exists, + skip_kernel, + ) = _match_call_trace_regex( + ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"], + demangled_kernel_name=demangled_kernel_name, + debug=False, ) node_set = [ Node({"name": "RAJAPerf", "type": "function"}), From 01d8d4c2f175c4f76fb83c5a164865e404c7ef97 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 14:06:12 -0700 Subject: [PATCH 16/19] Add license to file --- thicket/tests/test_helpers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py index 7aa1d5d9..7988c3a6 100644 --- a/thicket/tests/test_helpers.py +++ b/thicket/tests/test_helpers.py @@ -1,3 +1,8 @@ +# Copyright 2022 Lawrence Livermore National Security, LLC and other +# Thicket Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + from thicket.helpers import ( _match_call_trace_regex, _match_kernel_str_to_cali, From 51b99acbb5b8dd6efed9f548475195a330492b53 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 15:14:33 -0700 Subject: [PATCH 17/19] Refactor import so functions can be defined in ncu.py --- thicket/helpers.py | 111 ----------------- thicket/ncu.py | 216 +++++++++++++++++++++++++--------- thicket/tests/test_helpers.py | 3 +- 3 files changed, 161 insertions(+), 169 deletions(-) diff --git a/thicket/helpers.py b/thicket/helpers.py index 7ea6ca1b..c6e7314c 100644 --- a/thicket/helpers.py +++ b/thicket/helpers.py @@ -3,9 +3,6 @@ # # SPDX-License-Identifier: MIT -import re -from difflib import SequenceMatcher - from more_itertools import powerset import pandas as pd @@ -171,111 +168,3 @@ def _get_perf_columns(df): def _powerset_from_tuple(tup): pset = [y for y in powerset(tup)] return {x[0] if len(x) == 1 else x for x in pset} - - -def _match_call_trace_regex( - kernel_call_trace, demangled_kernel_name, debug, action=None -): - """Use the NCU call trace to regex match the kernel name from the demangled - kernel string. Also modifies the demangled kernel name in certain cases. Returns - the matched kernel string, if match is possible. - - Arguments: - kernel_call_trace (list): List of strings from NCU representing the call trace - demangled_kernel_name (str): Demangled kernel name from NCU - debug (bool): Print debug statements - action (ncu_report.IAction): NCU action object - """ - # Call trace with last element removed - # (last elem usually not useful for matching) - temp_call_trace = kernel_call_trace[:-1] - # Special case to match "cub" kernels - if "cub" in demangled_kernel_name: - call_trace_str = "cub" - # Replace substrings that may cause mismatch - demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true") - demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false") - else: - call_trace_str = "::".join([s.lower() for s in temp_call_trace]) - if debug: - print(f"\tKernel Call Trace: {kernel_call_trace}") - print(f"\t{action.name()}") - - # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA - kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" - kernel_match = re.search(kernel_pattern, demangled_kernel_name) - # Found match - if kernel_match: - kernel_str = kernel_match.group(1) - else: - if debug: - print(f"\tCould not match {demangled_kernel_name}") - return None, None, None, True - - # RAJA_CUDA/Lambda_CUDA variant - instance_pattern = r"instance (\d+)" - instance_match = re.findall(instance_pattern, demangled_kernel_name) - if instance_match: - instance_num = instance_match[-1] - instance_exists = True - else: - # Base_CUDA variant - instance_num = None - instance_exists = False - - return kernel_str, demangled_kernel_name, instance_num, instance_exists, False - - -def _match_kernel_str_to_cali( - node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists -): - """Given a set of nodes, node_set, from querying the Caliper call - tree using the NCU call trace, match the kernel_str to one of the - node names. Additionally, use the instance number, instance_num to - match kernels with multiple instances, if applicable. - - Arguments: - node_set (list): List of Hatchet nodes from querying the call tree - kernel_str (str): Kernel name from _match_call_trace_regex - instance_num (int): Instance number of kernel, if applicable - raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA - instance_exists (bool): True if instance number exists, False if not - """ - return [ - n - for n in node_set - if kernel_str in n.frame["name"] - and ( - f"#{instance_num}" in n.frame["name"] - if raja_lambda_cuda and instance_exists - else True - ) - ] - - -def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug): - """If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity. - - Arguments: - matched_nodes (list): List of matched Hatchet nodes - demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex - debug (bool): Print debug statements - - Returns: - matched_node (Hatchet.node): Hatchet node with highest similarity score - """ - # Attempt to match using similarity - match_dict = {} - for node in matched_nodes: - match_ratio = SequenceMatcher( - None, node.frame["name"], demangled_kernel_name - ).ratio() - match_dict[match_ratio] = node - # Get highest ratio - highest_ratio = max(list(match_dict.keys())) - matched_node = match_dict[highest_ratio] - if debug: - print( - f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..." - ) - return matched_node diff --git a/thicket/ncu.py b/thicket/ncu.py index 8d29029a..c3842362 100644 --- a/thicket/ncu.py +++ b/thicket/ncu.py @@ -4,71 +4,164 @@ # SPDX-License-Identifier: MIT from collections import defaultdict +from difflib import SequenceMatcher +import re from hatchet import QueryMatcher import pandas as pd from tqdm import tqdm -from .helpers import ( - _match_call_trace_regex, - _match_kernel_str_to_cali, - _multi_match_fallback_similarity, -) -import ncu_report +def _match_call_trace_regex( + kernel_call_trace, demangled_kernel_name, debug, action=None +): + """Use the NCU call trace to regex match the kernel name from the demangled + kernel string. Also modifies the demangled kernel name in certain cases. Returns + the matched kernel string, if match is possible. + + Arguments: + kernel_call_trace (list): List of strings from NCU representing the call trace + demangled_kernel_name (str): Demangled kernel name from NCU + debug (bool): Print debug statements + action (ncu_report.IAction): NCU action object + """ + # Call trace with last element removed (last elem usually not useful for matching) + temp_call_trace = kernel_call_trace[:-1] + # Special case to match "cub" kernels + if "cub" in demangled_kernel_name: + call_trace_str = "cub" + # Replace substrings that may cause mismatch + demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true") + demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false") + else: + call_trace_str = "::".join([s.lower() for s in temp_call_trace]) + if debug: + print(f"\tKernel Call Trace: {kernel_call_trace}") + print(f"\t{action.name()}") + + # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA + kernel_pattern = rf"{call_trace_str}::(\w+)[<:]" + kernel_match = re.search(kernel_pattern, demangled_kernel_name) + # Found match + if kernel_match: + kernel_str = kernel_match.group(1) + else: + if debug: + print(f"\tCould not match {demangled_kernel_name}") + return None, None, None, True + + # RAJA_CUDA/Lambda_CUDA variant + instance_pattern = r"instance (\d+)" + instance_match = re.findall(instance_pattern, demangled_kernel_name) + if instance_match: + instance_num = instance_match[-1] + instance_exists = True + else: + # Base_CUDA variant + instance_num = None + instance_exists = False + + return kernel_str, demangled_kernel_name, instance_num, instance_exists, False + + +def _match_kernel_str_to_cali( + node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists +): + """Given a set of nodes, node_set, from querying the Caliper call + tree using the NCU call trace, match the kernel_str to one of the + node names. Additionally, use the instance number, instance_num to + match kernels with multiple instances, if applicable. + + Arguments: + node_set (list): List of Hatchet nodes from querying the call tree + kernel_str (str): Kernel name from _match_call_trace_regex + instance_num (int): Instance number of kernel, if applicable + raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA + instance_exists (bool): True if instance number exists, False if not + """ + return [ + n + for n in node_set + if kernel_str in n.frame["name"] + and ( + f"#{instance_num}" in n.frame["name"] + if raja_lambda_cuda and instance_exists + else True + ) + ] + + +def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug): + """If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity. + + Arguments: + matched_nodes (list): List of matched Hatchet nodes + demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex + debug (bool): Print debug statements + + Returns: + matched_node (Hatchet.node): Hatchet node with highest similarity score + """ + # Attempt to match using similarity + match_dict = {} + for node in matched_nodes: + match_ratio = SequenceMatcher( + None, node.frame["name"], demangled_kernel_name + ).ratio() + match_dict[match_ratio] = node + # Get highest ratio + highest_ratio = max(list(match_dict.keys())) + matched_node = match_dict[highest_ratio] + if debug: + print( + f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..." + ) + return matched_node + + +def _build_query_from_ncu_trace(kernel_call_trace): + """Build QueryLanguage query from an NCU kernel call trace + + Arguments: + kernel_call_trace (list): Call trace as seen from NCU + """ + + def _predicate_builder(kernel, is_regex=False): + """Build predicate for QueryMatcher while forcing memoization -class NCUReader: - """Object to interface and pull NCU report data into Thicket""" + Arguments: + kernel (str): kernel name + is_regex (bool): whether kernel is a regex - rollup_operations = { - None: None, - ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean, # 1 - ncu_report.IMetric.RollupOperation_MAX: pd.Series.max, # 2 - ncu_report.IMetric.RollupOperation_MIN: pd.Series.min, # 3 - ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum, # 4 - } + Returns: + predicate (function): predicate function + """ + if is_regex: + return ( + lambda row: row["name"] + .apply(lambda x: kernel in x if x is not None else False) + .all() + ) + else: + return lambda row: row["name"].apply(lambda x: x == kernel).all() - @staticmethod - def _build_query_from_ncu_trace(kernel_call_trace): - """Build QueryLanguage query from an NCU kernel call trace + query = QueryMatcher() + for i, kernel in enumerate(kernel_call_trace): + if i == 0: + query.match(".", _predicate_builder(kernel)) + elif i == len(kernel_call_trace) - 1: + query.rel("*") + query.rel(".", _predicate_builder(kernel, is_regex=True)) + else: + query.rel(".", _predicate_builder(kernel)) - Arguments: - kernel_call_trace (list): Call trace as seen from NCU - """ + return query - def _predicate_builder(kernel, is_regex=False): - """Build predicate for QueryMatcher while forcing memoization - - Arguments: - kernel (str): kernel name - is_regex (bool): whether kernel is a regex - - Returns: - predicate (function): predicate function - """ - if is_regex: - return ( - lambda row: row["name"] - .apply(lambda x: kernel in x if x is not None else False) - .all() - ) - else: - return lambda row: row["name"].apply(lambda x: x == kernel).all() - - query = QueryMatcher() - for i, kernel in enumerate(kernel_call_trace): - if i == 0: - query.match(".", _predicate_builder(kernel)) - elif i == len(kernel_call_trace) - 1: - query.rel("*") - query.rel(".", _predicate_builder(kernel, is_regex=True)) - else: - query.rel(".", _predicate_builder(kernel)) - - return query - - @staticmethod - def _read_ncu(thicket, ncu_report_mapping, debug=False): + +class NCUReader: + """Object to interface and pull NCU report data into Thicket""" + + def _read_ncu(self, thicket, ncu_report_mapping, debug=False): """Read NCU report files and return dictionary of data. Arguments: @@ -79,6 +172,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): Returns: data_dict (dict): dictionary of NCU data where key is tuple, (node, profile), mapping to list of dictionaries for per-rep data that is aggregated down to one dictionary. """ + # Lazy import ncu_report + import ncu_report + + # Rollup operations + self.rollup_operations = { + None: None, + ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean, # 1 + ncu_report.IMetric.RollupOperation_MAX: pd.Series.max, # 2 + ncu_report.IMetric.RollupOperation_MIN: pd.Series.min, # 3 + ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum, # 4 + } # Initialize dict data_dict = defaultdict(list) @@ -172,9 +276,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False): matched_node = kernel_map[demangled_kernel_name] else: # kernel hasn't been seen yet # Build query - query = NCUReader._build_query_from_ncu_trace( - kernel_call_trace - ) + query = _build_query_from_ncu_trace(kernel_call_trace) # Apply the query node_set = query.apply(thicket) # Find the correct node. This may also get the parent so we take the last one diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py index 7988c3a6..488bd600 100644 --- a/thicket/tests/test_helpers.py +++ b/thicket/tests/test_helpers.py @@ -3,11 +3,12 @@ # # SPDX-License-Identifier: MIT -from thicket.helpers import ( +from thicket.ncu import ( _match_call_trace_regex, _match_kernel_str_to_cali, _multi_match_fallback_similarity, ) + from hatchet.node import Node From 1f3494ebdb0e426d326f16f934ac6b0d29fc60e0 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 15:17:52 -0700 Subject: [PATCH 18/19] Rename file --- thicket/tests/{test_helpers.py => test_ncu.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename thicket/tests/{test_helpers.py => test_ncu.py} (100%) diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_ncu.py similarity index 100% rename from thicket/tests/test_helpers.py rename to thicket/tests/test_ncu.py From ef8a6dd8d6bce9da4344bb9869b3a04b8ed42ad9 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 23 Oct 2024 15:44:46 -0700 Subject: [PATCH 19/19] Reorder import --- thicket/tests/test_ncu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thicket/tests/test_ncu.py b/thicket/tests/test_ncu.py index 488bd600..d89d9b7f 100644 --- a/thicket/tests/test_ncu.py +++ b/thicket/tests/test_ncu.py @@ -3,14 +3,14 @@ # # SPDX-License-Identifier: MIT +from hatchet.node import Node + from thicket.ncu import ( _match_call_trace_regex, _match_kernel_str_to_cali, _multi_match_fallback_similarity, ) -from hatchet.node import Node - def test_match_call_trace_regex():