Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NCU Reader Support for RAJA_CUDA and Lambda_CUDA #201

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
297 changes: 234 additions & 63 deletions thicket/ncu.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,185 @@
# SPDX-License-Identifier: MIT

from collections import defaultdict
from difflib import SequenceMatcher
import re

from hatchet import QueryMatcher
import pandas as pd
from tqdm import tqdm

import ncu_report

def _match_call_trace_regex(
kernel_call_trace, demangled_kernel_name, debug, action=None
):
"""Use the NCU call trace to regex match the kernel name from the demangled
kernel string. Also modifies the demangled kernel name in certain cases. Returns
the matched kernel string, if match is possible.

class NCUReader:
"""Object to interface and pull NCU report data into Thicket"""
Arguments:
kernel_call_trace (list): List of strings from NCU representing the call trace
demangled_kernel_name (str): Demangled kernel name from NCU
debug (bool): Print debug statements
action (ncu_report.IAction): NCU action object
"""
# Call trace with last element removed (last elem usually not useful for matching)
temp_call_trace = kernel_call_trace[:-1]
# Special case to match "cub" kernels
if "cub" in demangled_kernel_name:
call_trace_str = "cub"
# Replace substrings that may cause mismatch
demangled_kernel_name = demangled_kernel_name.replace("(bool)1", "true")
demangled_kernel_name = demangled_kernel_name.replace("(bool)0", "false")
else:
call_trace_str = "::".join([s.lower() for s in temp_call_trace])
if debug:
print(f"\tKernel Call Trace: {kernel_call_trace}")
print(f"\t{action.name()}")

# Pattern ends with ":" if RAJA_CUDA, "<" if Base_CUDA
kernel_pattern = rf"{call_trace_str}::(\w+)[<:]"
kernel_match = re.search(kernel_pattern, demangled_kernel_name)
# Found match
if kernel_match:
kernel_str = kernel_match.group(1)
else:
if debug:
print(f"\tCould not match {demangled_kernel_name}")
return None, None, None, True

# RAJA_CUDA/Lambda_CUDA variant
instance_pattern = r"instance (\d+)"
instance_match = re.findall(instance_pattern, demangled_kernel_name)
if instance_match:
instance_num = instance_match[-1]
instance_exists = True
else:
# Base_CUDA variant
instance_num = None
instance_exists = False

return kernel_str, demangled_kernel_name, instance_num, instance_exists, False


def _match_kernel_str_to_cali(
node_set, kernel_str, instance_num, raja_lambda_cuda, instance_exists
):
"""Given a set of nodes, node_set, from querying the Caliper call
tree using the NCU call trace, match the kernel_str to one of the
node names. Additionally, use the instance number, instance_num to
match kernels with multiple instances, if applicable.

Arguments:
node_set (list): List of Hatchet nodes from querying the call tree
kernel_str (str): Kernel name from _match_call_trace_regex
instance_num (int): Instance number of kernel, if applicable
raja_lambda_cuda (bool): True if RAJA_CUDA or Lambda_CUDA, False if Base_CUDA
instance_exists (bool): True if instance number exists, False if not
"""
return [
n
for n in node_set
if kernel_str in n.frame["name"]
and (
f"#{instance_num}" in n.frame["name"]
if raja_lambda_cuda and instance_exists
else True
)
]


def _multi_match_fallback_similarity(matched_nodes, demangled_kernel_name, debug):
"""If _match_kernel_str_to_cali has more than one match, attempt to match using sequence similarity.

rollup_operations = {
None: None,
ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean, # 1
ncu_report.IMetric.RollupOperation_MAX: pd.Series.max, # 2
ncu_report.IMetric.RollupOperation_MIN: pd.Series.min, # 3
ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum, # 4
}
Arguments:
matched_nodes (list): List of matched Hatchet nodes
demangled_kernel_name (str): Demangled kernel name from _match_call_trace_regex
debug (bool): Print debug statements

@staticmethod
def _build_query_from_ncu_trace(kernel_call_trace):
"""Build QueryLanguage query from an NCU kernel call trace
Returns:
matched_node (Hatchet.node): Hatchet node with highest similarity score
"""
# Attempt to match using similarity
match_dict = {}
for node in matched_nodes:
match_ratio = SequenceMatcher(
None, node.frame["name"], demangled_kernel_name
).ratio()
match_dict[match_ratio] = node
# Get highest ratio
highest_ratio = max(list(match_dict.keys()))
matched_node = match_dict[highest_ratio]
if debug:
print(
f"NOTICE: Multiple matches ({len(matched_nodes)}) found for kernel. Matching using string similarity..."
)
return matched_node


def _build_query_from_ncu_trace(kernel_call_trace):
    """Build QueryLanguage query from an NCU kernel call trace

    The first trace element starts the query with an exact name match, every
    intermediate element adds an exact-match node, and the last element is
    preceded by a "*" wildcard and matched by substring containment.

    Arguments:
        kernel_call_trace (list): Call trace as seen from NCU

    Returns:
        query (QueryMatcher): query built from the call trace
    """

    def _predicate_builder(kernel, is_regex=False):
        """Build predicate for QueryMatcher while forcing memoization

        Passing kernel as an argument binds its value per predicate, so the
        lambdas created in the loop below do not all share the loop variable.

        Arguments:
            kernel (str): kernel name
            is_regex (bool): if True, match when kernel is a substring of the
                name (plain containment, not a compiled regular expression);
                otherwise require equality

        Returns:
            predicate (function): predicate function
        """
        if is_regex:
            # None names never match
            return (
                lambda row: row["name"]
                .apply(lambda x: kernel in x if x is not None else False)
                .all()
            )
        return lambda row: row["name"].apply(lambda x: x == kernel).all()

    query = QueryMatcher()
    last_index = len(kernel_call_trace) - 1
    for i, kernel in enumerate(kernel_call_trace):
        if i == 0:
            # Root of the query
            query.match(".", _predicate_builder(kernel))
        elif i == last_index:
            # Allow any number of nodes before the final, loosely matched one
            query.rel("*")
            query.rel(".", _predicate_builder(kernel, is_regex=True))
        else:
            query.rel(".", _predicate_builder(kernel))

    return query


class NCUReader:
"""Object to interface and pull NCU report data into Thicket"""

def _read_ncu(self, thicket, ncu_report_mapping, debug=False):
"""Read NCU report files and return dictionary of data.

Arguments:
thicket (Thicket): thicket object to add ncu metrics to
ncu_report_mapping (dict): mapping from NCU report file to profile
debug (bool): whether to print debug statements

Returns:
data_dict (dict): dictionary of NCU data where key is tuple, (node, profile), mapping to list of dictionaries for per-rep data that is aggregated down to one dictionary.
"""
# Lazy import ncu_report
import ncu_report

# Rollup operations
self.rollup_operations = {
None: None,
ncu_report.IMetric.RollupOperation_AVG: pd.Series.mean, # 1
ncu_report.IMetric.RollupOperation_MAX: pd.Series.max, # 2
ncu_report.IMetric.RollupOperation_MIN: pd.Series.min, # 3
ncu_report.IMetric.RollupOperation_SUM: pd.Series.sum, # 4
}

# Initialize dict
data_dict = defaultdict(list)
Expand All @@ -82,20 +192,34 @@ def _read_ncu(thicket, ncu_report_mapping):

# Loop through NCU files
for ncu_report_file in ncu_report_mapping:
# Set error check flag
call_trace_found = False

# NCU hash
profile_mapping_flipped = {v: k for k, v in thicket.profile_mapping.items()}
ncu_hash = profile_mapping_flipped[ncu_report_mapping[ncu_report_file]]

# Relevant for kernel matching
variant = thicket.metadata.loc[ncu_hash, "variant"]
raja_lambda_cuda = (
variant.upper() == "RAJA_CUDA" or variant.upper() == "LAMBDA_CUDA"
)

# Load file
report = ncu_report.load_report(ncu_report_file)

# Error check
if report.num_ranges() > 1:
num_ranges = report.num_ranges()
if num_ranges > 1:
raise ValueError(
"NCU report file "
+ ncu_report_file
+ " has multiple ranges. Not supported yet."
)
elif num_ranges == 0:
raise ValueError(
"NCU report file " + ncu_report_file + " has no ranges (no data)."
)
# Loop through ranges in report
for range in report:
# Grab first action
Expand All @@ -113,8 +237,12 @@ def _read_ncu(thicket, ncu_report_mapping):
pbar = tqdm(range)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can users disable the tqdm progress bar?

for i, action in enumerate(pbar):
pbar.set_description(f"Processing action {i}/{len(range)}")
# Name of kernel
kernel_name = action.name()
if debug:
print(f"Action: {i}")
# Demangled name of kernel
demangled_kernel_name = action.name(
ncu_report.IAction.NameBase_DEMANGLED
)
# Get NCU-side kernel trace
kernel_call_trace = list(
action.nvtx_state().domain_by_id(0).push_pop_ranges()
Expand All @@ -124,35 +252,78 @@ def _read_ncu(thicket, ncu_report_mapping):
if len(kernel_call_trace) == 0:
continue
else:
call_trace_found = True

(
kernel_str,
demangled_kernel_name,
instance_num,
instance_exists,
skip_kernel,
) = _match_call_trace_regex(
kernel_call_trace, demangled_kernel_name, debug, action
)
if skip_kernel:
continue

# Add kernel name to the end of the trace tuple
kernel_call_trace.append(kernel_name)
kernel_call_trace.append(kernel_str)

# Match ncu kernel to thicket node
matched_node = None
if kernel_name in kernel_map:
if demangled_kernel_name in kernel_map:
# Skip query building
matched_node = kernel_map[kernel_name]
matched_node = kernel_map[demangled_kernel_name]
else: # kernel hasn't been seen yet
# Build query
query = NCUReader._build_query_from_ncu_trace(
kernel_call_trace
)
query = _build_query_from_ncu_trace(kernel_call_trace)
# Apply the query
node_set = query.apply(thicket)
# Find the correct node
matched_node = [
n for n in node_set if kernel_name in n.frame["name"]
][0]
# Find the correct node. This may also get the parent so we take the last one
matched_nodes = _match_kernel_str_to_cali(
node_set,
kernel_str,
instance_num,
raja_lambda_cuda,
instance_exists,
)
if len(matched_nodes) > 1:
matched_node = _multi_match_fallback_similarity(
matched_nodes, demangled_kernel_name, debug
)
elif len(matched_nodes) == 1:
matched_node = matched_nodes[0]
else:
raise ValueError(
"No node found for kernel: " + kernel_str
)

if debug:
if not raja_lambda_cuda or not instance_exists:
instance_num = "NA"
print(
f"\tMatched NCU kernel:\n\t\t{demangled_kernel_name}\n\tto Caliper Node:\n\t\t{matched_node}"
)
print(
f"\tAKA:\n\t\t{kernel_str} (instance {instance_num}) == {kernel_str} (#{instance_num})\n"
)
print("\tAll matched nodes:")
for node in matched_nodes:
print("\t", node)

# matched_node should always exist at this point
assert matched_node is not None
# Set mapping
kernel_map[kernel_name] = matched_node
kernel_map[demangled_kernel_name] = matched_node

metric_values = [action[name].value() for name in metric_names]

assert len(metric_names) == len(metric_values)
data_dict[(matched_node, ncu_hash)].append(
dict(zip(metric_names, metric_values))
)

if not call_trace_found:
raise ValueError(
f"No kernel call traces found in {ncu_report_file}.\nCheck you are enabling the NVTX Caliper service when running NCU."
)

return data_dict, rollup_dict
Loading
Loading