From b3792012669f8b01aa707d542ac0a0a504a09343 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 13:03:46 -0700
Subject: [PATCH 01/19] Add error check

---
 thicket/ncu.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 0e8440c5..573899bb 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -82,6 +82,9 @@ def _read_ncu(thicket, ncu_report_mapping):
 
         # Loop through NCU files
         for ncu_report_file in ncu_report_mapping:
+            # Set error check flag
+            call_trace_found = False
+
             # NCU hash
             profile_mapping_flipped = {v: k for k, v in thicket.profile_mapping.items()}
             ncu_hash = profile_mapping_flipped[ncu_report_mapping[ncu_report_file]]
@@ -124,6 +127,7 @@ def _read_ncu(thicket, ncu_report_mapping):
                     if len(kernel_call_trace) == 0:
                         continue
                     else:
+                        call_trace_found=True
                         # Add kernel name to the end of the trace tuple
                         kernel_call_trace.append(kernel_name)
 
@@ -155,4 +159,7 @@ def _read_ncu(thicket, ncu_report_mapping):
                             dict(zip(metric_names, metric_values))
                         )
 
+            if not call_trace_found:
+                raise ValueError(f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU.")
+
         return data_dict, rollup_dict

From f3e76d592ec0736f1e0b70e62f3f24b290a8eab9 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 13:06:09 -0700
Subject: [PATCH 02/19] Add error check

---
 thicket/ncu.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 573899bb..1fbef15b 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -93,12 +93,19 @@ def _read_ncu(thicket, ncu_report_mapping):
             report = ncu_report.load_report(ncu_report_file)
 
             # Error check
-            if report.num_ranges() > 1:
+            num_ranges = report.num_ranges()
+            if num_ranges > 1:
                 raise ValueError(
                     "NCU report file "
                     + ncu_report_file
                     + " has multiple ranges. Not supported yet."
                 )
+            elif num_ranges == 0:
+                raise ValueError(
+                    "NCU report file "
+                    + ncu_report_file
+                    + " has no ranges (no data)."
+                )
             # Loop through ranges in report
             for range in report:
                 # Grab first action

From 2c2016dff650a3a62baa27cf66d2376eb7b1eace Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 15:51:27 -0700
Subject: [PATCH 03/19] Enable matching RAJA_CUDA data using demangled string

---
 thicket/ncu.py     | 80 +++++++++++++++++++++++++++++++++++++---------
 thicket/thicket.py |  4 +--
 2 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 1fbef15b..8c382dc2 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
+import re
 
 from hatchet import QueryMatcher
 import pandas as pd
@@ -63,7 +64,7 @@ def _predicate_builder(kernel, is_regex=False):
         return query
 
     @staticmethod
-    def _read_ncu(thicket, ncu_report_mapping):
+    def _read_ncu(thicket, ncu_report_mapping, debug=False):
         """Read NCU report files and return dictionary of data.
 
         Arguments:
@@ -89,6 +90,10 @@ def _read_ncu(thicket, ncu_report_mapping):
             profile_mapping_flipped = {v: k for k, v in thicket.profile_mapping.items()}
             ncu_hash = profile_mapping_flipped[ncu_report_mapping[ncu_report_file]]
 
+            # Relevant for kernel matching
+            variant = thicket.metadata.loc[ncu_hash, "variant"]
+            raja_cuda = variant.upper() == "RAJA_CUDA"
+
             # Load file
             report = ncu_report.load_report(ncu_report_file)
 
@@ -102,9 +107,7 @@ def _read_ncu(thicket, ncu_report_mapping):
                 )
             elif num_ranges == 0:
                 raise ValueError(
-                    "NCU report file "
-                    + ncu_report_file
-                    + " has no ranges (no data)."
+                    "NCU report file " + ncu_report_file + " has no ranges (no data)."
                 )
             # Loop through ranges in report
             for range in report:
@@ -123,8 +126,10 @@ def _read_ncu(thicket, ncu_report_mapping):
                 pbar = tqdm(range)
                 for i, action in enumerate(pbar):
                     pbar.set_description(f"Processing action {i}/{len(range)}")
-                    # Name of kernel
-                    kernel_name = action.name()
+                    # Demangled name of kernel
+                    demangled_kernel_name = action.name(
+                        ncu_report.IAction.NameBase_DEMANGLED
+                    )
                     # Get NCU-side kernel trace
                     kernel_call_trace = list(
                         action.nvtx_state().domain_by_id(0).push_pop_ranges()
@@ -134,9 +139,32 @@ def _read_ncu(thicket, ncu_report_mapping):
                     if len(kernel_call_trace) == 0:
                         continue
                     else:
-                        call_trace_found=True
+                        call_trace_found = True
+
+                        # Call trace with last element removed
+                        # (last elem usually not useful for matching)
+                        temp_call_trace = kernel_call_trace[:-1]
+                        call_trace_str = "::".join([s.lower() for s in temp_call_trace])
+
+                        # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
+                        kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
+                        kernel_match = re.search(kernel_pattern, demangled_kernel_name)
+                        kernel_str = kernel_match.group(1)
+
+                        if raja_cuda:
+                            # RAJA_CUDA variant
+                            instance_pattern = r"instance (\d+)"
+                            instance_match = re.findall(
+                                instance_pattern, demangled_kernel_name
+                            )
+                            instance_num = instance_match[-1]
+                            kernel_name = kernel_str + "_" + instance_num
+                        else:
+                            # Base_CUDA variant
+                            kernel_name = kernel_str
+
                         # Add kernel name to the end of the trace tuple
-                        kernel_call_trace.append(kernel_name)
+                        kernel_call_trace.append(kernel_str)
 
                         # Match ncu kernel to thicket node
                         matched_node = None
@@ -150,23 +178,45 @@ def _read_ncu(thicket, ncu_report_mapping):
                             )
                             # Apply the query
                             node_set = query.apply(thicket)
-                            # Find the correct node
-                            matched_node = [
-                                n for n in node_set if kernel_name in n.frame["name"]
-                            ][0]
+                            # Find the correct node. This may also get the parent so we take the last one
+                            matched_nodes = [
+                                n
+                                for n in node_set
+                                if kernel_str in n.frame["name"]
+                                and (
+                                    f"#{instance_num}" in n.frame["name"]
+                                    if raja_cuda
+                                    else True
+                                )
+                            ]
+                            matched_node = matched_nodes[0]
+
+                            if debug:
+                                if not raja_cuda:
+                                    instance_num = "NA"
+                                print(
+                                    f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}"
+                                )
+                                print(
+                                    f"AKA:\n\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n"
+                                )
+                                print("All matched nodes:")
+                                for node in matched_nodes:
+                                    print("\t", node)
 
-                        # matched_node should always exist at this point
-                        assert matched_node is not None
                         # Set mapping
                         kernel_map[kernel_name] = matched_node
 
                         metric_values = [action[name].value() for name in metric_names]
+
                         assert len(metric_names) == len(metric_values)
                         data_dict[(matched_node, ncu_hash)].append(
                             dict(zip(metric_names, metric_values))
                         )
 
             if not call_trace_found:
-                raise ValueError(f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU.")
+                raise ValueError(
+                    f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU."
+                )
 
         return data_dict, rollup_dict
diff --git a/thicket/thicket.py b/thicket/thicket.py
index 32f2c2d2..7f386fe6 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -554,7 +554,7 @@ def from_json(json_thicket):
         # make and return thicket?
         return th
 
-    def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False):
+    def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False):
         """Add NCU data into the PerformanceDataFrame
 
         Arguments:
@@ -589,7 +589,7 @@ def _rep_agg_func(col):
         ncureader = NCUReader()
 
         # Dictionary of NCU data
-        data_dict, rollup_dict = ncureader._read_ncu(self, ncu_report_mapping)
+        data_dict, rollup_dict = ncureader._read_ncu(self, ncu_report_mapping, debug)
 
         # Create empty df
         ncu_df = pd.DataFrame()

From 8f76091a8362cf49547d4dfa5563d4907168c961 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 16:04:05 -0700
Subject: [PATCH 04/19] black and docstring

---
 thicket/ncu.py     | 1 +
 thicket/thicket.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 8c382dc2..12e4954d 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -70,6 +70,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
         Arguments:
             thicket (Thicket): thicket object to add ncu metrics to
             ncu_report_mapping (dict): mapping from NCU report file to profile
+            debug (bool): whether to print debug statements
 
         Returns:
             data_dict (dict): dictionary of NCU data where key is tuple, (node, profile), mapping to list of dictionaries for per-rep data that is aggregated down to one dictionary.
diff --git a/thicket/thicket.py b/thicket/thicket.py
index 7f386fe6..4534b62a 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -554,13 +554,16 @@ def from_json(json_thicket):
         # make and return thicket?
         return th
 
-    def add_ncu(self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False):
+    def add_ncu(
+            self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False
+        ):
         """Add NCU data into the PerformanceDataFrame
 
         Arguments:
             ncu_report_mapping (dict): mapping from NCU report file to profile
             chosen_metrics (list): list of metrics to sub-select from NCU report
             overwrite (bool): whether to overwrite existing columns in the Thicket.DataFrame
+            debug (bool): whether to print debug information
         """
 
         def _rep_agg_func(col):

From b4cb38cc6a85dac2db8813c2f93f954c08d518fb Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 16:06:41 -0700
Subject: [PATCH 05/19] black

---
 thicket/thicket.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/thicket/thicket.py b/thicket/thicket.py
index 4534b62a..c616f0ad 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -555,8 +555,8 @@ def from_json(json_thicket):
         return th
 
     def add_ncu(
-            self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False
-        ):
+        self, ncu_report_mapping, chosen_metrics=None, overwrite=False, debug=False
+    ):
         """Add NCU data into the PerformanceDataFrame
 
         Arguments:

From 14145bb198552043e86a1063669b1dad5519b866 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 16:22:58 -0700
Subject: [PATCH 06/19] Add Lambda_CUDA support

---
 thicket/ncu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 12e4954d..6293a398 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -93,7 +93,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
 
             # Relevant for kernel matching
             variant = thicket.metadata.loc[ncu_hash, "variant"]
-            raja_cuda = variant.upper() == "RAJA_CUDA"
+            raja_lambda_cuda = variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA"
 
             # Load file
             report = ncu_report.load_report(ncu_report_file)
@@ -152,7 +152,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                         kernel_match = re.search(kernel_pattern, demangled_kernel_name)
                         kernel_str = kernel_match.group(1)
 
-                        if raja_cuda:
+                        if raja_lambda_cuda:
                             # RAJA_CUDA variant
                             instance_pattern = r"instance (\d+)"
                             instance_match = re.findall(
@@ -186,14 +186,14 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                 if kernel_str in n.frame["name"]
                                 and (
                                     f"#{instance_num}" in n.frame["name"]
-                                    if raja_cuda
+                                    if raja_lambda_cuda
                                     else True
                                 )
                             ]
                             matched_node = matched_nodes[0]
 
                             if debug:
-                                if not raja_cuda:
+                                if not raja_lambda_cuda:
                                     instance_num = "NA"
                                 print(
                                     f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}"

From 5ac8720146cdd160b38e59b2d4b1452a0f874ef9 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Mon, 29 Jul 2024 20:04:05 -0700
Subject: [PATCH 07/19] black

---
 thicket/ncu.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 6293a398..ed23ae36 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -93,7 +93,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
 
             # Relevant for kernel matching
             variant = thicket.metadata.loc[ncu_hash, "variant"]
-            raja_lambda_cuda = variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA"
+            raja_lambda_cuda = (
+                variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA"
+            )
 
             # Load file
             report = ncu_report.load_report(ncu_report_file)

From aa3146fe6588a74fa20c03079c012bd19ef09bb6 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen34.coral.llnl.gov>
Date: Tue, 30 Jul 2024 12:27:08 -0700
Subject: [PATCH 08/19] Update docstring

---
 thicket/thicket.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thicket/thicket.py b/thicket/thicket.py
index c616f0ad..8f9f6c80 100644
--- a/thicket/thicket.py
+++ b/thicket/thicket.py
@@ -560,7 +560,7 @@ def add_ncu(
         """Add NCU data into the PerformanceDataFrame
 
         Arguments:
-            ncu_report_mapping (dict): mapping from NCU report file to profile
+            ncu_report_mapping (dict): mapping from NCU report file to Caliper CUDA Activity Profile
             chosen_metrics (list): list of metrics to sub-select from NCU report
             overwrite (bool): whether to overwrite existing columns in the Thicket.DataFrame
             debug (bool): whether to print debug information

From caa6a3cabfcda815daa8d8e186825196480d9c66 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Thu, 3 Oct 2024 15:13:24 -0700
Subject: [PATCH 09/19] Skip matching if different pattern. Add additional
 debug statements

---
 thicket/ncu.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index ed23ae36..1aa45247 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -129,6 +129,8 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                 pbar = tqdm(range)
                 for i, action in enumerate(pbar):
                     pbar.set_description(f"Processing action {i}/{len(range)}")
+                    if debug:
+                        print(f"Action: {i}")
                     # Demangled name of kernel
                     demangled_kernel_name = action.name(
                         ncu_report.IAction.NameBase_DEMANGLED
@@ -148,14 +150,22 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                         # (last elem usually not useful for matching)
                         temp_call_trace = kernel_call_trace[:-1]
                         call_trace_str = "::".join([s.lower() for s in temp_call_trace])
+                        if debug:
+                            print(f"\tKernel Call Trace: {call_trace_str}")
 
                         # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
                         kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
                         kernel_match = re.search(kernel_pattern, demangled_kernel_name)
-                        kernel_str = kernel_match.group(1)
+                        # Found match
+                        if kernel_match:
+                            kernel_str = kernel_match.group(1)
+                        else:
+                            if debug:
+                                print(f"\tCould not match {demangled_kernel_name}")
+                            continue
 
                         if raja_lambda_cuda:
-                            # RAJA_CUDA variant
+                            # RAJA_CUDA/Lambda_CUDA variant
                             instance_pattern = r"instance (\d+)"
                             instance_match = re.findall(
                                 instance_pattern, demangled_kernel_name
@@ -198,12 +208,12 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                 if not raja_lambda_cuda:
                                     instance_num = "NA"
                                 print(
-                                    f"Matched NCU kernel:\n\t{demangled_kernel_name}\nto Caliper Node:\n\t{matched_node}"
+                                    f"\tMatched NCU kernel:\n\t\t{demangled_kernel_name}\n\tto Caliper Node:\n\t\t{matched_node}"
                                 )
                                 print(
-                                    f"AKA:\n\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n"
+                                    f"\tAKA:\n\t\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n"
                                 )
-                                print("All matched nodes:")
+                                print("\tAll matched nodes:")
                                 for node in matched_nodes:
                                     print("\t", node)
 

From dbe3e5806fa7c7ecdf9914977f6f55a62dcd17ce Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby961.llnl.gov>
Date: Thu, 3 Oct 2024 16:46:49 -0700
Subject: [PATCH 10/19] Partial cub support

---
 thicket/ncu.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 1aa45247..559b70ad 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -149,9 +149,16 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                         # Call trace with last element removed
                         # (last elem usually not useful for matching)
                         temp_call_trace = kernel_call_trace[:-1]
-                        call_trace_str = "::".join([s.lower() for s in temp_call_trace])
+                        # Special case to match "cub" kernels
+                        if "cub" in demangled_kernel_name:
+                            call_trace_str = "cub"
+                        else:
+                            call_trace_str = "::".join(
+                                [s.lower() for s in temp_call_trace]
+                            )
                         if debug:
                             print(f"\tKernel Call Trace: {call_trace_str}")
+                            print(action.name())
 
                         # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
                         kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
@@ -164,17 +171,19 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                 print(f"\tCould not match {demangled_kernel_name}")
                             continue
 
-                        if raja_lambda_cuda:
-                            # RAJA_CUDA/Lambda_CUDA variant
-                            instance_pattern = r"instance (\d+)"
-                            instance_match = re.findall(
-                                instance_pattern, demangled_kernel_name
-                            )
+                        # RAJA_CUDA/Lambda_CUDA variant
+                        instance_pattern = r"instance (\d+)"
+                        instance_match = re.findall(
+                            instance_pattern, demangled_kernel_name
+                        )
+                        if instance_match:
                             instance_num = instance_match[-1]
                             kernel_name = kernel_str + "_" + instance_num
+                            instance_exists = True
                         else:
                             # Base_CUDA variant
                             kernel_name = kernel_str
+                            instance_exists = False
 
                         # Add kernel name to the end of the trace tuple
                         kernel_call_trace.append(kernel_str)
@@ -198,14 +207,14 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                 if kernel_str in n.frame["name"]
                                 and (
                                     f"#{instance_num}" in n.frame["name"]
-                                    if raja_lambda_cuda
+                                    if raja_lambda_cuda and instance_exists
                                     else True
                                 )
                             ]
                             matched_node = matched_nodes[0]
 
                             if debug:
-                                if not raja_lambda_cuda:
+                                if not raja_lambda_cuda or not instance_exists:
                                     instance_num = "NA"
                                 print(
                                     f"\tMatched NCU kernel:\n\t\t{demangled_kernel_name}\n\tto Caliper Node:\n\t\t{matched_node}"

From 5ea81853c4a68b9ca01b9a9984694a1966f9e8af Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby966.llnl.gov>
Date: Fri, 4 Oct 2024 12:17:43 -0700
Subject: [PATCH 11/19] change cache to use demangled name as key for
 uniqueness. Add similarity matching for cub kernels

---
 thicket/ncu.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index 559b70ad..ac5cbb26 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
+from difflib import SequenceMatcher
 import re
 
 from hatchet import QueryMatcher
@@ -152,12 +153,19 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                         # Special case to match "cub" kernels
                         if "cub" in demangled_kernel_name:
                             call_trace_str = "cub"
+                            # Replace substrings that may cause mismatch
+                            demangled_kernel_name = demangled_kernel_name.replace(
+                                "(bool)1", "true"
+                            )
+                            demangled_kernel_name = demangled_kernel_name.replace(
+                                "(bool)0", "false"
+                            )
                         else:
                             call_trace_str = "::".join(
                                 [s.lower() for s in temp_call_trace]
                             )
                         if debug:
-                            print(f"\tKernel Call Trace: {call_trace_str}")
+                            print(f"\tKernel Call Trace: {kernel_call_trace}")
                             print(action.name())
 
                         # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
@@ -190,9 +198,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
 
                         # Match ncu kernel to thicket node
                         matched_node = None
-                        if kernel_name in kernel_map:
+                        if demangled_kernel_name in kernel_map:
                             # Skip query building
-                            matched_node = kernel_map[kernel_name]
+                            matched_node = kernel_map[demangled_kernel_name]
                         else:  # kernel hasn't been seen yet
                             # Build query
                             query = NCUReader._build_query_from_ncu_trace(
@@ -211,7 +219,25 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                     else True
                                 )
                             ]
-                            matched_node = matched_nodes[0]
+                            if len(matched_nodes) > 1:
+                                # Attempt to match using similarity
+                                match_dict = {}
+                                for node in matched_nodes:
+                                    match_ratio = SequenceMatcher(
+                                        None, node.frame["name"], demangled_kernel_name
+                                    ).ratio()
+                                    match_dict[node] = match_ratio
+                                matched_node = max(match_dict, key=match_dict.get)
+                                if debug:
+                                    print(
+                                        f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
+                                    )
+                            elif len(matched_nodes) == 1:
+                                matched_node = matched_nodes[0]
+                            else:
+                                raise ValueError(
+                                    "No node found for kernel: " + kernel_str
+                                )
 
                             if debug:
                                 if not raja_lambda_cuda or not instance_exists:
@@ -227,7 +253,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                                     print("\t", node)
 
                         # Set mapping
-                        kernel_map[kernel_name] = matched_node
+                        kernel_map[demangled_kernel_name] = matched_node
 
                         metric_values = [action[name].value() for name in metric_names]
 

From ff13797d08952f971dddc299ac6ca8c660cb988a Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby966.llnl.gov>
Date: Fri, 4 Oct 2024 12:33:37 -0700
Subject: [PATCH 12/19] Remove unused variable

---
 thicket/ncu.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/thicket/ncu.py b/thicket/ncu.py
index ac5cbb26..c3119658 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -186,11 +186,9 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                         )
                         if instance_match:
                             instance_num = instance_match[-1]
-                            kernel_name = kernel_str + "_" + instance_num
                             instance_exists = True
                         else:
                             # Base_CUDA variant
-                            kernel_name = kernel_str
                             instance_exists = False
 
                         # Add kernel name to the end of the trace tuple

From 9a9eab01cfc5b7487e1cebb09906eef02e61308f Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 10:52:19 -0700
Subject: [PATCH 13/19] Refactor matching functions to enable unit testing

---
 thicket/helpers.py            | 109 ++++++++++++++++++++++++
 thicket/ncu.py                |  91 ++++++--------------
 thicket/tests/test_helpers.py | 155 ++++++++++++++++++++++++++++++++++
 3 files changed, 289 insertions(+), 66 deletions(-)
 create mode 100644 thicket/tests/test_helpers.py

diff --git a/thicket/helpers.py b/thicket/helpers.py
index c6e7314c..18324874 100644
--- a/thicket/helpers.py
+++ b/thicket/helpers.py
@@ -3,6 +3,9 @@
 #
 # SPDX-License-Identifier: MIT
 
+import re
+from difflib import SequenceMatcher
+
 from more_itertools import powerset
 import pandas as pd
 
@@ -168,3 +171,109 @@ def _get_perf_columns(df):
 def _powerset_from_tuple(tup):
     pset = [y for y in powerset(tup)]
     return {x[0] if len(x) == 1 else x for x in pset}
+
+
+def _match_call_trace_regex(kernel_call_trace, demangled_kernel_name, debug, action=None):
+    """Use the NCU call trace to regex match the kernel name from the demangled
+    kernel string. Also modifies the demangled kernel name in certain cases. Returns
+    the matched kernel string, if match is possible.
+
+    Arguments:
+        kernel_call_trace (list): List of strings from NCU representing the call trace
+        demangled_kernel_name (str): Demangled kernel name from NCU
+        debug (bool): Print debug statements
+        action (ncu_report.IAction): NCU action object
+    """
+    # Call trace with last element removed
+    # (last elem usually not useful for matching)
+    temp_call_trace = kernel_call_trace[:-1]
+    # Special case to match "cub" kernels
+    if "cub" in demangled_kernel_name:
+        call_trace_str = "cub"
+        # Replace substrings that may cause mismatch
+        demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true")
+        demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false")
+    else:
+        call_trace_str = "::".join([s.lower() for s in temp_call_trace])
+    if debug:
+        print(f"\tKernel Call Trace: {kernel_call_trace}")
+        print(f"\t{action.name()}")
+
+    # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
+    kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
+    kernel_match = re.search(kernel_pattern, demangled_kernel_name)
+    # Found match
+    if kernel_match:
+        kernel_str = kernel_match.group(1)
+    else:
+        if debug:
+            print(f"\tCould not match {demangled_kernel_name}")
+        return None, None, None, True
+
+    # RAJA_CUDA/Lambda_CUDA variant
+    instance_pattern = r"instance (\d+)"
+    instance_match = re.findall(instance_pattern, demangled_kernel_name)
+    if instance_match:
+        instance_num = instance_match[-1]
+        instance_exists = True
+    else:
+        # Base_CUDA variant
+        instance_num = None
+        instance_exists = False
+
+    return kernel_str, demangled_kernel_name, instance_num, instance_exists, False
+
+
+def _match_kernel_str_to_cali(
+    node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists
+):
+    """Given a set of nodes, node_set, from querying the Caliper call
+    tree using the NCU call trace, match the kernel_str to one of the
+    node names. Additionally, use the instance number, instance_num to
+    match kernels with multiple instances, if applicable.
+
+    Arguments:
+        node_set (list): List of Hatchet nodes from querying the call tree
+        kernel_str (str): Kernel name from _match_call_trace_regex
+        instance_num (int): Instance number of kernel, if applicable
+        raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA
+        instance_exists (bool): True if instance number exists, False if not
+    """
+    return [
+        n
+        for n in node_set
+        if kernel_str in n.frame["name"]
+        and (
+            f"#{instance_num}" in n.frame["name"]
+            if raja_lambda_cuda and instance_exists
+            else True
+        )
+    ]
+
+
+def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug):
+    """If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity.
+
+    Arguments:
+        matched_nodes (list): List of matched Hatchet nodes
+        demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex
+        debug (bool): Print debug statements
+
+    Returns:
+        matched_node (Hatchet.node): Hatchet node with highest similarity score
+    """
+    # Attempt to match using similarity
+    match_dict = {}
+    for node in matched_nodes:
+        match_ratio = SequenceMatcher(
+            None, node.frame["name"], demangled_kernel_name
+        ).ratio()
+        match_dict[match_ratio] = node
+    # Get highest ratio
+    highest_ratio = max(list(match_dict.keys()))
+    matched_node = match_dict[highest_ratio]
+    if debug:
+        print(
+            f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
+        )
+    return matched_node
diff --git a/thicket/ncu.py b/thicket/ncu.py
index c3119658..8d29029a 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -4,13 +4,16 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
-from difflib import SequenceMatcher
-import re
 
 from hatchet import QueryMatcher
 import pandas as pd
 from tqdm import tqdm
 
+from .helpers import (
+    _match_call_trace_regex,
+    _match_kernel_str_to_cali,
+    _multi_match_fallback_similarity,
+)
 import ncu_report
 
 
@@ -147,49 +150,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                     else:
                         call_trace_found = True
 
-                        # Call trace with last element removed
-                        # (last elem usually not useful for matching)
-                        temp_call_trace = kernel_call_trace[:-1]
-                        # Special case to match "cub" kernels
-                        if "cub" in demangled_kernel_name:
-                            call_trace_str = "cub"
-                            # Replace substrings that may cause mismatch
-                            demangled_kernel_name = demangled_kernel_name.replace(
-                                "(bool)1", "true"
-                            )
-                            demangled_kernel_name = demangled_kernel_name.replace(
-                                "(bool)0", "false"
-                            )
-                        else:
-                            call_trace_str = "::".join(
-                                [s.lower() for s in temp_call_trace]
-                            )
-                        if debug:
-                            print(f"\tKernel Call Trace: {kernel_call_trace}")
-                            print(action.name())
-
-                        # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
-                        kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
-                        kernel_match = re.search(kernel_pattern, demangled_kernel_name)
-                        # Found match
-                        if kernel_match:
-                            kernel_str = kernel_match.group(1)
-                        else:
-                            if debug:
-                                print(f"\tCould not match {demangled_kernel_name}")
-                            continue
-
-                        # RAJA_CUDA/Lambda_CUDA variant
-                        instance_pattern = r"instance (\d+)"
-                        instance_match = re.findall(
-                            instance_pattern, demangled_kernel_name
+                        (
+                            kernel_str,
+                            demangled_kernel_name,
+                            instance_num,
+                            instance_exists,
+                            skip_kernel,
+                        ) = _match_call_trace_regex(
+                            kernel_call_trace, demangled_kernel_name, debug, action
                         )
-                        if instance_match:
-                            instance_num = instance_match[-1]
-                            instance_exists = True
-                        else:
-                            # Base_CUDA variant
-                            instance_exists = False
+                        if skip_kernel:
+                            continue
 
                         # Add kernel name to the end of the trace tuple
                         kernel_call_trace.append(kernel_str)
@@ -207,29 +178,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                             # Apply the query
                             node_set = query.apply(thicket)
                             # Find the correct node. This may also get the parent so we take the last one
-                            matched_nodes = [
-                                n
-                                for n in node_set
-                                if kernel_str in n.frame["name"]
-                                and (
-                                    f"#{instance_num}" in n.frame["name"]
-                                    if raja_lambda_cuda and instance_exists
-                                    else True
-                                )
-                            ]
+                            matched_nodes = _match_kernel_str_to_cali(
+                                node_set,
+                                kernel_str,
+                                instance_num,
+                                raja_lambda_cuda,
+                                instance_exists,
+                            )
                             if len(matched_nodes) > 1:
-                                # Attempt to match using similarity
-                                match_dict = {}
-                                for node in matched_nodes:
-                                    match_ratio = SequenceMatcher(
-                                        None, node.frame["name"], demangled_kernel_name
-                                    ).ratio()
-                                    match_dict[node] = match_ratio
-                                matched_node = max(match_dict, key=match_dict.get)
-                                if debug:
-                                    print(
-                                        f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
-                                    )
+                                matched_node = _multi_match_fallback_similarity(
+                                    matched_nodes, demangled_kernel_name, debug
+                                )
                             elif len(matched_nodes) == 1:
                                 matched_node = matched_nodes[0]
                             else:
diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py
new file mode 100644
index 00000000..04c0b55d
--- /dev/null
+++ b/thicket/tests/test_helpers.py
@@ -0,0 +1,155 @@
+from thicket.helpers import (
+    _match_call_trace_regex,
+    _match_kernel_str_to_cali,
+    _multi_match_fallback_similarity,
+)
+from hatchet.node import Node
+
+
+def test_match_call_trace_regex():
+
+    # Base_CUDA variant
+    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
+        _match_call_trace_regex(
+            ["RAJAPerf", "Basic", "Basic_DAXPY"],
+            "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)",
+            debug=False,
+        )
+    )
+    assert kernel_str == "daxpy"
+
+    # lambda_CUDA variant
+    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
+        _match_call_trace_regex(
+            ["RAJAPerf", "Polybench", "Polybench_ATAX"],
+            "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)",
+            debug=False,
+        )
+    )
+    assert kernel_str == "poly_atax_lam"
+
+    # RAJA_CUDA variant
+    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
+        _match_call_trace_regex(
+            ["RAJAPerf", "Apps", "Apps_ENERGY"],
+            "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
+            debug=False,
+        )
+    )
+    assert kernel_str == "ENERGY"
+
+
+def test_match_kernel_str_to_cali():
+    # RAJA_CUDA variant
+    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
+        _match_call_trace_regex(
+            ["RAJAPerf", "Apps", "Apps_ENERGY"],
+            "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
+            debug=False,
+        )
+    )
+    # Test multi-instance (for energy4)
+    node_set = [
+        Node({"name": "RAJAPerf", "type": "function"}),
+        Node({"name": "Apps", "type": "function"}),
+        Node({"name": "Apps_ENERGY", "type": "function"}),
+        Node({"name": "cudaLaunchKernel", "type": "function"}),
+        # energy1
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#1}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#1}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+        # energy2
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#2}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#2}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+        # energy3
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#3}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#3}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+        # energy4
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+        # energy5
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#5}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#5}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+        # energy6
+        Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#6}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#6}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ),
+    ]
+    matched_nodes = _match_kernel_str_to_cali(
+        node_set, kernel_str, instance_num, True, instance_exists
+    )
+    assert len(matched_nodes) == 1
+    # energy4
+    assert (
+        matched_nodes[0].frame["name"]
+        == Node(
+            {
+                "name": "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, RAJA::cuda::MaxOccupancyConcretizer, 1ul, true>, 1ul, RAJA::Iterators::numeric_iterator<long, long, long*>, void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, 128, 0>, 128ul>(void rajaperf::apps::ENERGY::runCudaVariantImpl<128ul>(rajaperf::VariantID)::{lambda()#1}::operator()() const::{lambda(long)#4}, RAJA::Iterators::numeric_iterator<long, long, long*>, long)",
+                "type": "kernel",
+            }
+        ).frame["name"]
+    )
+
+
+def test_multi_match_fallback_similarity():
+    # CUB kernels
+    demangled_kernel_name = "void cub::DeviceRadixSortUpsweepKernel<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, (bool)1, (bool)0, double, int>(const T4 *, T5 *, T5, int, int, cub::GridEvenShare<T5>)"
+    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
+        _match_call_trace_regex(
+            ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"],
+            demangled_kernel_name=demangled_kernel_name,
+            debug=False,
+        )
+    )
+    node_set = [
+        Node({"name": "RAJAPerf", "type": "function"}),
+        Node({"name": "Algorithm", "type": "function"}),
+        Node({"name": "Algorithm_SORT", "type": "function"}),
+        Node({"name": "cudaLaunchKernel", "type": "function"}),
+        # "false, false" wrong match
+        Node(
+            {
+                "name": "void cub::DeviceRadixSortUpsweepKernel<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, false, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare<int>)",
+                "type": "kernel",
+            }
+        ),
+        # "true, false" correct match
+        Node(
+            {
+                "name": "void cub::DeviceRadixSortUpsweepKernel<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, true, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare<int>)",
+                "type": "kernel",
+            }
+        ),
+    ]
+    matched_nodes = _match_kernel_str_to_cali(
+        node_set, kernel_str, instance_num, True, instance_exists
+    )
+    matched_node = _multi_match_fallback_similarity(
+        matched_nodes, demangled_kernel_name, debug=False
+    )
+    assert (
+        matched_node.frame["name"]
+        == "void cub::DeviceRadixSortUpsweepKernel<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, true, false, double, int>(double const*, int*, int, int, int, cub::GridEvenShare<int>)"
+    )

From 9c65ac70087b298820cdac2245311fcae0734c92 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 10:58:33 -0700
Subject: [PATCH 14/19] Format thicket/helpers.py with black

---
 thicket/helpers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/thicket/helpers.py b/thicket/helpers.py
index 18324874..7ea6ca1b 100644
--- a/thicket/helpers.py
+++ b/thicket/helpers.py
@@ -173,7 +173,9 @@ def _powerset_from_tuple(tup):
     return {x[0] if len(x) == 1 else x for x in pset}
 
 
-def _match_call_trace_regex(kernel_call_trace, demangled_kernel_name, debug, action=None):
+def _match_call_trace_regex(
+    kernel_call_trace, demangled_kernel_name, debug, action=None
+):
     """Use the NCU call trace to regex match the kernel name from the demangled
     kernel string. Also modifies the demangled kernel name in certain cases. Returns
     the matched kernel string, if match is possible.

From 87e2c0bbde9c6a2563e96a346e49fab06fb1e9fc Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 10:59:30 -0700
Subject: [PATCH 15/19] Format thicket/tests/test_helpers.py with black

---
 thicket/tests/test_helpers.py | 80 ++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py
index 04c0b55d..7aa1d5d9 100644
--- a/thicket/tests/test_helpers.py
+++ b/thicket/tests/test_helpers.py
@@ -9,44 +9,60 @@
 def test_match_call_trace_regex():
 
     # Base_CUDA variant
-    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
-        _match_call_trace_regex(
-            ["RAJAPerf", "Basic", "Basic_DAXPY"],
-            "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)",
-            debug=False,
-        )
+    (
+        kernel_str,
+        demangled_kernel_name,
+        instance_num,
+        instance_exists,
+        skip_kernel,
+    ) = _match_call_trace_regex(
+        ["RAJAPerf", "Basic", "Basic_DAXPY"],
+        "void rajaperf::basic::daxpy<(unsigned long)128>(double *, double *, double, long)",
+        debug=False,
     )
     assert kernel_str == "daxpy"
 
     # lambda_CUDA variant
-    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
-        _match_call_trace_regex(
-            ["RAJAPerf", "Polybench", "Polybench_ATAX"],
-            "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)",
-            debug=False,
-        )
+    (
+        kernel_str,
+        demangled_kernel_name,
+        instance_num,
+        instance_exists,
+        skip_kernel,
+    ) = _match_call_trace_regex(
+        ["RAJAPerf", "Polybench", "Polybench_ATAX"],
+        "void rajaperf::polybench::poly_atax_lam<(unsigned long)128, void rajaperf::polybench::POLYBENCH_ATAX::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda(long) (instance 2)]>(long, T2)",
+        debug=False,
     )
     assert kernel_str == "poly_atax_lam"
 
     # RAJA_CUDA variant
-    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
-        _match_call_trace_regex(
-            ["RAJAPerf", "Apps", "Apps_ENERGY"],
-            "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
-            debug=False,
-        )
+    (
+        kernel_str,
+        demangled_kernel_name,
+        instance_num,
+        instance_exists,
+        skip_kernel,
+    ) = _match_call_trace_regex(
+        ["RAJAPerf", "Apps", "Apps_ENERGY"],
+        "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
+        debug=False,
     )
     assert kernel_str == "ENERGY"
 
 
 def test_match_kernel_str_to_cali():
     # RAJA_CUDA variant
-    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
-        _match_call_trace_regex(
-            ["RAJAPerf", "Apps", "Apps_ENERGY"],
-            "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
-            debug=False,
-        )
+    (
+        kernel_str,
+        demangled_kernel_name,
+        instance_num,
+        instance_exists,
+        skip_kernel,
+    ) = _match_call_trace_regex(
+        ["RAJAPerf", "Apps", "Apps_ENERGY"],
+        "void RAJA::policy::cuda::impl::forall_cuda_kernel<RAJA::policy::cuda::cuda_exec_explicit<RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, RAJA::cuda::MaxOccupancyConcretizer, (unsigned long)1, (bool)1>, (unsigned long)1, RAJA::Iterators::numeric_iterator<long, long, long *>, void rajaperf::apps::ENERGY::runCudaVariantImpl<(unsigned long)128>(rajaperf::VariantID)::[lambda() (instance 1)]::operator ()() const::[lambda(long) (instance 4)], long, RAJA::iteration_mapping::Direct, RAJA::cuda::IndexGlobal<(RAJA::named_dim)0, (int)128, (int)0>, (unsigned long)128>(T4, T3, T5)",
+        debug=False,
     )
     # Test multi-instance (for energy4)
     node_set = [
@@ -116,12 +132,16 @@ def test_match_kernel_str_to_cali():
 def test_multi_match_fallback_similarity():
     # CUB kernels
     demangled_kernel_name = "void cub::DeviceRadixSortUpsweepKernel<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, (bool)1, (bool)0, double, int>(const T4 *, T5 *, T5, int, int, cub::GridEvenShare<T5>)"
-    kernel_str, demangled_kernel_name, instance_num, instance_exists, skip_kernel = (
-        _match_call_trace_regex(
-            ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"],
-            demangled_kernel_name=demangled_kernel_name,
-            debug=False,
-        )
+    (
+        kernel_str,
+        demangled_kernel_name,
+        instance_num,
+        instance_exists,
+        skip_kernel,
+    ) = _match_call_trace_regex(
+        ["RAJAPerf", "Algorithm", "Algorithm_SORT", "DeviceRadixSortUpsweepKernel"],
+        demangled_kernel_name=demangled_kernel_name,
+        debug=False,
     )
     node_set = [
         Node({"name": "RAJAPerf", "type": "function"}),

From 01d8d4c2f175c4f76fb83c5a164865e404c7ef97 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 14:06:12 -0700
Subject: [PATCH 16/19] Add license to file

---
 thicket/tests/test_helpers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py
index 7aa1d5d9..7988c3a6 100644
--- a/thicket/tests/test_helpers.py
+++ b/thicket/tests/test_helpers.py
@@ -1,3 +1,8 @@
+# Copyright 2022 Lawrence Livermore National Security, LLC and other
+# Thicket Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: MIT
+
 from thicket.helpers import (
     _match_call_trace_regex,
     _match_kernel_str_to_cali,

From 51b99acbb5b8dd6efed9f548475195a330492b53 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 15:14:33 -0700
Subject: [PATCH 17/19] Refactor import so functions can be defined in ncu.py

---
 thicket/helpers.py            | 111 -----------------
 thicket/ncu.py                | 216 +++++++++++++++++++++++++---------
 thicket/tests/test_helpers.py |   3 +-
 3 files changed, 161 insertions(+), 169 deletions(-)

diff --git a/thicket/helpers.py b/thicket/helpers.py
index 7ea6ca1b..c6e7314c 100644
--- a/thicket/helpers.py
+++ b/thicket/helpers.py
@@ -3,9 +3,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-import re
-from difflib import SequenceMatcher
-
 from more_itertools import powerset
 import pandas as pd
 
@@ -171,111 +168,3 @@ def _get_perf_columns(df):
 def _powerset_from_tuple(tup):
     pset = [y for y in powerset(tup)]
     return {x[0] if len(x) == 1 else x for x in pset}
-
-
-def _match_call_trace_regex(
-    kernel_call_trace, demangled_kernel_name, debug, action=None
-):
-    """Use the NCU call trace to regex match the kernel name from the demangled
-    kernel string. Also modifies the demangled kernel name in certain cases. Returns
-    the matched kernel string, if match is possible.
-
-    Arguments:
-        kernel_call_trace (list): List of strings from NCU representing the call trace
-        demangled_kernel_name (str): Demangled kernel name from NCU
-        debug (bool): Print debug statements
-        action (ncu_report.IAction): NCU action object
-    """
-    # Call trace with last element removed
-    # (last elem usually not useful for matching)
-    temp_call_trace = kernel_call_trace[:-1]
-    # Special case to match "cub" kernels
-    if "cub" in demangled_kernel_name:
-        call_trace_str = "cub"
-        # Replace substrings that may cause mismatch
-        demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true")
-        demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false")
-    else:
-        call_trace_str = "::".join([s.lower() for s in temp_call_trace])
-    if debug:
-        print(f"\tKernel Call Trace: {kernel_call_trace}")
-        print(f"\t{action.name()}")
-
-    # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
-    kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
-    kernel_match = re.search(kernel_pattern, demangled_kernel_name)
-    # Found match
-    if kernel_match:
-        kernel_str = kernel_match.group(1)
-    else:
-        if debug:
-            print(f"\tCould not match {demangled_kernel_name}")
-        return None, None, None, True
-
-    # RAJA_CUDA/Lambda_CUDA variant
-    instance_pattern = r"instance (\d+)"
-    instance_match = re.findall(instance_pattern, demangled_kernel_name)
-    if instance_match:
-        instance_num = instance_match[-1]
-        instance_exists = True
-    else:
-        # Base_CUDA variant
-        instance_num = None
-        instance_exists = False
-
-    return kernel_str, demangled_kernel_name, instance_num, instance_exists, False
-
-
-def _match_kernel_str_to_cali(
-    node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists
-):
-    """Given a set of nodes, node_set, from querying the Caliper call
-    tree using the NCU call trace, match the kernel_str to one of the
-    node names. Additionally, use the instance number, instance_num to
-    match kernels with multiple instances, if applicable.
-
-    Arguments:
-        node_set (list): List of Hatchet nodes from querying the call tree
-        kernel_str (str): Kernel name from _match_call_trace_regex
-        instance_num (int): Instance number of kernel, if applicable
-        raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA
-        instance_exists (bool): True if instance number exists, False if not
-    """
-    return [
-        n
-        for n in node_set
-        if kernel_str in n.frame["name"]
-        and (
-            f"#{instance_num}" in n.frame["name"]
-            if raja_lambda_cuda and instance_exists
-            else True
-        )
-    ]
-
-
-def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug):
-    """If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity.
-
-    Arguments:
-        matched_nodes (list): List of matched Hatchet nodes
-        demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex
-        debug (bool): Print debug statements
-
-    Returns:
-        matched_node (Hatchet.node): Hatchet node with highest similarity score
-    """
-    # Attempt to match using similarity
-    match_dict = {}
-    for node in matched_nodes:
-        match_ratio = SequenceMatcher(
-            None, node.frame["name"], demangled_kernel_name
-        ).ratio()
-        match_dict[match_ratio] = node
-    # Get highest ratio
-    highest_ratio = max(list(match_dict.keys()))
-    matched_node = match_dict[highest_ratio]
-    if debug:
-        print(
-            f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
-        )
-    return matched_node
diff --git a/thicket/ncu.py b/thicket/ncu.py
index 8d29029a..c3842362 100644
--- a/thicket/ncu.py
+++ b/thicket/ncu.py
@@ -4,71 +4,164 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
+from difflib import SequenceMatcher
+import re
 
 from hatchet import QueryMatcher
 import pandas as pd
 from tqdm import tqdm
 
-from .helpers import (
-    _match_call_trace_regex,
-    _match_kernel_str_to_cali,
-    _multi_match_fallback_similarity,
-)
-import ncu_report
 
+def _match_call_trace_regex(
+    kernel_call_trace, demangled_kernel_name, debug, action=None
+):
+    """Regex-match the kernel name inside an NCU demangled kernel string.
+
+    The demangled name is also normalized for "cub" kernels before matching.
+
+    Arguments:
+        kernel_call_trace (list): List of strings from NCU representing the call trace
+        demangled_kernel_name (str): Demangled kernel name from NCU
+        debug (bool): Print debug statements
+        action (ncu_report.IAction): NCU action object, only used for debug output
+
+    Returns:
+        (tuple): (kernel_str, demangled_kernel_name, instance_num, instance_exists,
+        skip_kernel). On a failed match the first four are None and skip_kernel is
+        True, so callers can always unpack five values.
+    """
+    if "cub" in demangled_kernel_name:
+        # Special case to match "cub" kernels
+        call_trace_str = "cub"
+        # Replace substrings that may cause mismatch
+        demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true")
+        demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false")
+    else:
+        # Drop the last trace element (usually not useful for matching) and escape
+        # the rest so regex metacharacters in region names are matched literally
+        call_trace_str = "::".join(
+            re.escape(s.lower()) for s in kernel_call_trace[:-1]
+        )
+    if debug:
+        print(f"\tKernel Call Trace: {kernel_call_trace}")
+        if action is not None:  # action is optional; do not crash debug runs
+            print(f"\t{action.name()}")
+
+    # Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
+    kernel_match = re.search(rf"{call_trace_str}::(\w+)[<:]", demangled_kernel_name)
+    if kernel_match is None:
+        if debug:
+            print(f"\tCould not match {demangled_kernel_name}")
+        return None, None, None, None, True
+    kernel_str = kernel_match.group(1)
+
+    # RAJA_CUDA/Lambda_CUDA variants carry "instance N"; Base_CUDA does not
+    instance_match = re.findall(r"instance (\d+)", demangled_kernel_name)
+    instance_exists = bool(instance_match)
+    # The last occurrence is used, as a string; None when absent
+    instance_num = instance_match[-1] if instance_exists else None
+
+    return kernel_str, demangled_kernel_name, instance_num, instance_exists, False
+
+
+def _match_kernel_str_to_cali(
+    node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists
+):
+    """Filter Caliper call-tree nodes down to those matching an NCU kernel.
+
+    A node matches when kernel_str is in its name and, for RAJA_CUDA/Lambda_CUDA
+    kernels with an instance number, "#<instance_num>" is in its name too.
+
+    Arguments:
+        node_set (list): List of Hatchet nodes from querying the call tree
+        kernel_str (str): Kernel name from _match_call_trace_regex
+        instance_num (int): Instance number of kernel, if applicable
+        raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA
+        instance_exists (bool): True if instance number exists, False if not
+
+    Returns:
+        (list): The matching nodes, in node_set order
+    """
+    instance_tag = f"#{instance_num}" if raja_lambda_cuda and instance_exists else ""
+    matches = []
+    for node in node_set:
+        name = node.frame["name"]
+        if kernel_str in name and instance_tag in name:
+            matches.append(node)
+    return matches
+
+
+def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug):
+    """Pick one node out of several candidates by string similarity.
+
+    Used when _match_kernel_str_to_cali has more than one match. Each node name is
+    scored against the demangled kernel name with difflib.SequenceMatcher.
+
+    Arguments:
+        matched_nodes (list): List of matched Hatchet nodes
+        demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex
+        debug (bool): Print debug statements
+
+    Returns:
+        matched_node (Hatchet.node): Hatchet node with highest similarity score
+        (the last such node in matched_nodes if several tie)
+    """
+    scored = [
+        (SequenceMatcher(None, node.frame["name"], demangled_kernel_name).ratio(), node)
+        for node in matched_nodes
+    ]
+    # max() keeps the first maximum, so scan in reverse to let later nodes win ties
+    _, matched_node = max(reversed(scored), key=lambda pair: pair[0])
+    if debug:
+        print(
+            f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
+        )
+    return matched_node
+
+
+def _build_query_from_ncu_trace(kernel_call_trace):
+    """Build QueryLanguage query from an NCU kernel call trace
+
+    Arguments:
+        kernel_call_trace (list): Call trace as seen from NCU
+    """
+
+    def _predicate_builder(kernel, is_regex=False):
+        """Build predicate for QueryMatcher while forcing memoization
 
-class NCUReader:
-    """Object to interface and pull NCU report data into Thicket"""
+        Arguments:
+            kernel (str): kernel name
+            is_regex (bool): whether kernel is a regex
 
-    rollup_operations = {
-        None: None,
-        ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean,  # 1
-        ncu_report.IMetric.RollupOperation_MAX: pd.Series.max,  # 2
-        ncu_report.IMetric.RollupOperation_MIN: pd.Series.min,  # 3
-        ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum,  # 4
-    }
+        Returns:
+            predicate (function): predicate function
+        """
+        if is_regex:
+            return (
+                lambda row: row["name"]
+                .apply(lambda x: kernel in x if x is not None else False)
+                .all()
+            )
+        else:
+            return lambda row: row["name"].apply(lambda x: x == kernel).all()
 
-    @staticmethod
-    def _build_query_from_ncu_trace(kernel_call_trace):
-        """Build QueryLanguage query from an NCU kernel call trace
+    query = QueryMatcher()
+    for i, kernel in enumerate(kernel_call_trace):
+        if i == 0:
+            query.match(".", _predicate_builder(kernel))
+        elif i == len(kernel_call_trace) - 1:
+            query.rel("*")
+            query.rel(".", _predicate_builder(kernel, is_regex=True))
+        else:
+            query.rel(".", _predicate_builder(kernel))
 
-        Arguments:
-            kernel_call_trace (list): Call trace as seen from NCU
-        """
+    return query
 
-        def _predicate_builder(kernel, is_regex=False):
-            """Build predicate for QueryMatcher while forcing memoization
-
-            Arguments:
-                kernel (str): kernel name
-                is_regex (bool): whether kernel is a regex
-
-            Returns:
-                predicate (function): predicate function
-            """
-            if is_regex:
-                return (
-                    lambda row: row["name"]
-                    .apply(lambda x: kernel in x if x is not None else False)
-                    .all()
-                )
-            else:
-                return lambda row: row["name"].apply(lambda x: x == kernel).all()
-
-        query = QueryMatcher()
-        for i, kernel in enumerate(kernel_call_trace):
-            if i == 0:
-                query.match(".", _predicate_builder(kernel))
-            elif i == len(kernel_call_trace) - 1:
-                query.rel("*")
-                query.rel(".", _predicate_builder(kernel, is_regex=True))
-            else:
-                query.rel(".", _predicate_builder(kernel))
-
-        return query
-
-    @staticmethod
-    def _read_ncu(thicket, ncu_report_mapping, debug=False):
+
+class NCUReader:
+    """Object to interface and pull NCU report data into Thicket"""
+
+    def _read_ncu(self, thicket, ncu_report_mapping, debug=False):
         """Read NCU report files and return dictionary of data.
 
         Arguments:
@@ -79,6 +172,17 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
         Returns:
             data_dict (dict): dictionary of NCU data where key is tuple, (node, profile), mapping to list of dictionaries for per-rep data that is aggregated down to one dictionary.
         """
+        # Lazy import ncu_report
+        import ncu_report
+
+        # Rollup operations
+        self.rollup_operations = {
+            None: None,
+            ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean,  # 1
+            ncu_report.IMetric.RollupOperation_MAX: pd.Series.max,  # 2
+            ncu_report.IMetric.RollupOperation_MIN: pd.Series.min,  # 3
+            ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum,  # 4
+        }
 
         # Initialize dict
         data_dict = defaultdict(list)
@@ -172,9 +276,7 @@ def _read_ncu(thicket, ncu_report_mapping, debug=False):
                             matched_node = kernel_map[demangled_kernel_name]
                         else:  # kernel hasn't been seen yet
                             # Build query
-                            query = NCUReader._build_query_from_ncu_trace(
-                                kernel_call_trace
-                            )
+                            query = _build_query_from_ncu_trace(kernel_call_trace)
                             # Apply the query
                             node_set = query.apply(thicket)
                             # Find the correct node. This may also get the parent so we take the last one
diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_helpers.py
index 7988c3a6..488bd600 100644
--- a/thicket/tests/test_helpers.py
+++ b/thicket/tests/test_helpers.py
@@ -3,11 +3,12 @@
 #
 # SPDX-License-Identifier: MIT
 
-from thicket.helpers import (
+from thicket.ncu import (
     _match_call_trace_regex,
     _match_kernel_str_to_cali,
     _multi_match_fallback_similarity,
 )
+
 from hatchet.node import Node
 
 

From 1f3494ebdb0e426d326f16f934ac6b0d29fc60e0 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 15:17:52 -0700
Subject: [PATCH 18/19] Rename file

---
 thicket/tests/{test_helpers.py => test_ncu.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename thicket/tests/{test_helpers.py => test_ncu.py} (100%)

diff --git a/thicket/tests/test_helpers.py b/thicket/tests/test_ncu.py
similarity index 100%
rename from thicket/tests/test_helpers.py
rename to thicket/tests/test_ncu.py

From ef8a6dd8d6bce9da4344bb9869b3a04b8ed42ad9 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@ruby968.llnl.gov>
Date: Wed, 23 Oct 2024 15:44:46 -0700
Subject: [PATCH 19/19] Reorder import

---
 thicket/tests/test_ncu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/thicket/tests/test_ncu.py b/thicket/tests/test_ncu.py
index 488bd600..d89d9b7f 100644
--- a/thicket/tests/test_ncu.py
+++ b/thicket/tests/test_ncu.py
@@ -3,14 +3,14 @@
 #
 # SPDX-License-Identifier: MIT
 
+from hatchet.node import Node
+
 from thicket.ncu import (
     _match_call_trace_regex,
     _match_kernel_str_to_cali,
     _multi_match_fallback_similarity,
 )
 
-from hatchet.node import Node
-
 
 def test_match_call_trace_regex():