Skip to content

Commit

Permalink
Add CTS for scratch register reading.
Browse files Browse the repository at this point in the history
  • Loading branch information
aviralni committed Oct 16, 2024
1 parent 32527cc commit 4697059
Show file tree
Hide file tree
Showing 4 changed files with 273 additions and 0 deletions.
138 changes: 138 additions & 0 deletions conformance_tests/tools/debug/src/test_debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "test_debug.hpp"
#include "test_debug_utils.hpp"
#include "test_harness/zet_intel_gpu_debug.h"

namespace lzt = level_zero_tests;

Expand Down Expand Up @@ -1376,6 +1377,143 @@ void zetDebugReadWriteRegistersTest::run_read_write_registers_test(
}
}

// Exercises register reads on interrupted threads of the "scratch" helper
// kernel.
//
// For every device in `devices` that reports debug support, this:
//   1. launches the LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH helper process and
//      attaches to it, acknowledging the module event when required;
//   2. obtains the GPU buffer address published by the application;
//   3. interrupts all device threads and collects the stopped ones;
//   4. on each stopped thread reads the mode-flags register set and, when the
//      heapless flag is set, reads the thread scratch register set;
//   5. writes a single zero byte to the GPU buffer, resumes the threads, and
//      expects module-unload and process-exit events plus a zero exit code.
//
// @param devices          devices to run the scenario on.
// @param use_sub_devices  forwarded unchanged to launch_process().
//
// All temporary buffers are owned by std::vector so that the early returns
// taken by FAIL()/ASSERT_* do not leak memory.
void zetDebugReadWriteRegistersTest::run_read_registers_test(
    std::vector<ze_device_handle_t> &devices, bool use_sub_devices) {
  for (auto &device : devices) {
    print_device(device);
    if (!is_debug_supported(device))
      continue;

    synchro->clear_debugger_signal();
    debugHelper = launch_process(LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH,
                                 device, use_sub_devices);

    zet_debug_event_t module_event;
    attach_and_get_module_event(debugHelper.id(), synchro, device, debugSession,
                                module_event);

    if (module_event.flags & ZET_DEBUG_EVENT_FLAG_NEED_ACK) {
      LOG_DEBUG << "[Debugger] Acking event: "
                << lzt::debuggerEventTypeString[module_event.type];
      lzt::debug_ack_event(debugSession, &module_event);
    }

    uint64_t gpu_buffer_va = 0;
    synchro->wait_for_application_signal();
    if (!synchro->get_app_gpu_buffer_address(gpu_buffer_va)) {
      FAIL() << "[Debugger] Could not get a valid GPU buffer VA";
    }
    synchro->clear_application_signal();

    // Zero-initialize so that no member of the descriptor is passed to the
    // driver with an indeterminate value.
    // NOTE(review): stype is left at zero here; confirm whether the wrapper or
    // driver expects it to be set explicitly.
    zet_debug_memory_space_desc_t memorySpaceDesc = {};
    memorySpaceDesc.type = ZET_DEBUG_MEMORY_SPACE_TYPE_DEFAULT;
    memorySpaceDesc.address = gpu_buffer_va;

    // set buffer[0] to 0 to break the loop. See debug_loop_slm.cl
    constexpr size_t kernel_buffer_size = 512;
    std::vector<uint8_t> kernel_buffer(kernel_buffer_size, 0);

    // UINT32_MAX in every field addresses all threads of the device.
    ze_device_thread_t device_threads = {};
    device_threads.slice = UINT32_MAX;
    device_threads.subslice = UINT32_MAX;
    device_threads.eu = UINT32_MAX;
    device_threads.thread = UINT32_MAX;

    LOG_INFO << "[Debugger] Stopping all device threads";
    // give time to app to launch the kernel
    std::this_thread::sleep_for(std::chrono::seconds(6));
    lzt::debug_interrupt(debugSession, device_threads);

    std::vector<ze_device_thread_t> stopped_threads;
    if (!find_stopped_threads(debugSession, device, device_threads, true,
                              stopped_threads)) {
      FAIL() << "[Debugger] Did not find stopped threads";
    }

    // The register set layout is queried per device, so fetch it once instead
    // of once per stopped thread.
    const auto register_set_properties =
        lzt::get_register_set_properties(device);

    bool is_heapless_mode = false;
    LOG_INFO << "[Debugger] Reading/Writing Thread Scratch Register on "
                "interrupted threads";

    for (auto &stopped_thread : stopped_threads) {
      // Step 1: read the mode flags to find out whether heapless mode is on.
      for (const auto &register_set : register_set_properties) {
        if (register_set.type != ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU)
          continue;

        const size_t reg_size_in_bytes =
            static_cast<size_t>(register_set.count) * register_set.byteSize;
        if (reg_size_in_bytes < sizeof(uint32_t)) {
          FAIL() << "[Debugger] Mode flags register set is smaller than "
                    "32 bits";
        }

        // uint32_t storage (rounded up) so the flags can be read without
        // reinterpreting a byte buffer.
        std::vector<uint32_t> mode_values(
            (reg_size_in_bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t), 0);
        ASSERT_EQ(zetDebugReadRegisters(debugSession, stopped_thread,
                                        register_set.type, 0,
                                        register_set.count, mode_values.data()),
                  ZE_RESULT_SUCCESS);

        LOG_DEBUG << "[Debugger] mode value: " << mode_values[0];
        is_heapless_mode =
            (mode_values[0] & ZET_DEBUG_MODE_FLAG_HEAPLESS) != 0;
      }

      if (!is_heapless_mode)
        continue;

      // Step 2: read the thread scratch register set. Other register sets are
      // skipped so the readable/writeable messages refer to this set only.
      for (const auto &register_set : register_set_properties) {
        if (register_set.type !=
            ZET_DEBUG_REGSET_TYPE_THREAD_SCRATCH_INTEL_GPU)
          continue;

        if (register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_READABLE) {
          LOG_DEBUG << "[Debugger] Register set type " << register_set.type
                    << " is readable";
          const size_t reg_size_in_bytes =
              static_cast<size_t>(register_set.count) * register_set.byteSize;

          // Size the buffer in bytes (rounded up to whole uint64_t elements),
          // not in uint64_t elements per byte.
          std::vector<uint64_t> thread_scratch_reg_values(
              (reg_size_in_bytes + sizeof(uint64_t) - 1) / sizeof(uint64_t),
              0);
          // Read the same register set whose properties were just checked.
          ASSERT_EQ(zetDebugReadRegisters(debugSession, stopped_thread,
                                          register_set.type, 0,
                                          register_set.count,
                                          thread_scratch_reg_values.data()),
                    ZE_RESULT_SUCCESS);
        } else {
          LOG_INFO << "[Debugger] Register set type " << register_set.type
                   << " is NOT readable";
        }

        if (!(register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_WRITEABLE)) {
          LOG_INFO << "[Debugger] Register set " << register_set.type
                   << " type is NOT writeable";
        }
      }
    }

    // Only the first byte is written back to the GPU buffer.
    lzt::debug_write_memory(debugSession, device_threads, memorySpaceDesc, 1,
                            kernel_buffer.data());

    LOG_INFO << "[Debugger] resuming interrupted threads";
    lzt::debug_resume(debugSession, device_threads);
    debugHelper.wait();

    std::vector<zet_debug_event_type_t> expectedEvents = {
        ZET_DEBUG_EVENT_TYPE_MODULE_UNLOAD, ZET_DEBUG_EVENT_TYPE_PROCESS_EXIT};

    if (!check_events(debugSession, expectedEvents)) {
      FAIL() << "[Debugger] Did not receive expected events";
    }

    lzt::debug_detach(debugSession);
    ASSERT_EQ(debugHelper.exit_code(), 0);
  }
}

// Runs the scratch-register read scenario on the devices returned for the
// default driver, with sub-device usage turned off.
TEST_F(
    zetDebugReadWriteRegistersTest,
    GivenActiveDebugSessionWhenReadingScratchRegistersThenDataReadIsDoneSuccessfully) {
  const bool with_sub_devices = false;
  auto default_driver = lzt::get_default_driver();
  auto driver_devices = lzt::get_devices(default_driver);
  run_read_registers_test(driver_devices, with_sub_devices);
}

TEST_F(
zetDebugReadWriteRegistersTest,
GivenActiveDebugSessionWhenReadingAndWritingRegistersThenValidDataReadAndDataWrittenSuccessfully) {
Expand Down
2 changes: 2 additions & 0 deletions conformance_tests/tools/debug/src/test_debug.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ class zetDebugReadWriteRegistersTest : public zetDebugMemAccessTest {
void TearDown() override { zetDebugMemAccessTest::TearDown(); }
void run_read_write_registers_test(std::vector<ze_device_handle_t> &devices,
bool use_sub_devices);
void run_read_registers_test(std::vector<ze_device_handle_t> &devices,
bool use_sub_devices);
};

class zetDebugThreadControlTest : public zetDebugBaseSetup {
Expand Down
1 change: 1 addition & 0 deletions conformance_tests/tools/debug/src/test_debug_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ typedef enum {
ATTACH_AFTER_MODULE_DESTROYED,
LONG_RUNNING_KERNEL_INTERRUPTED,
LONG_RUNNING_KERNEL_INTERRUPTED_SLM,
LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH,
PAGE_FAULT,
MULTIPLE_THREADS,
MULTIPLE_CQ,
Expand Down
132 changes: 132 additions & 0 deletions conformance_tests/tools/debug/src/test_debug_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,133 @@ void run_long_kernel(ze_context_handle_t context, ze_device_handle_t device,
}
}

// Application-side workload for the LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH
// scenario.
//
// Waits for the debugger signal, builds the module named by
// `options.module_name_in` with register spills forced through the build
// flags, launches the "long_kernel_slm" kernel, publishes the source device
// buffer address to the debugger, and after the queue has drained checks that
// the destination buffer matches the source buffer (element 0 excluded).
//
// @param context  context used for all allocations.
// @param device   device the module, queue and list are created on.
// @param synchro  debugger/application synchronization channel.
// @param options  supplies the SPIR-V module file name (module_name_in).
//
// Exits the process with status 1 when any GoogleTest expectation failed.
void run_long_kernel_scratch(ze_context_handle_t context,
                             ze_device_handle_t device,
                             process_synchro &synchro, debug_options &options) {

  auto command_list = lzt::create_command_list(device);
  auto command_queue = lzt::create_command_queue(device);
  std::string module_name = options.module_name_in;

  std::string kernel_name = "long_kernel_slm";
  // NOTE: Not all SKUs have the same amount of SLM, so keep this small.
  size_t slm_buffer_size = 512;

  synchro.wait_for_debugger_signal();
  const char *build_flags = "-g -igc_opts 'VISAOptions=-forcespills'";
  auto module =
      lzt::create_module(device, module_name, ZE_MODULE_FORMAT_IL_SPIRV,
                         build_flags /* include debug symbols*/, nullptr);

  auto kernel = lzt::create_function(module, kernel_name);
  auto size = slm_buffer_size;

  ze_kernel_properties_t kernel_properties = {
      ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES, nullptr};
  EXPECT_EQ(ZE_RESULT_SUCCESS,
            zeKernelGetProperties(kernel, &kernel_properties));

  // Integer ceiling division. Dividing first and then calling ceil() would
  // truncate before rounding; a zero subgroup size must not divide.
  const size_t max_subgroup_size = kernel_properties.maxSubgroupSize;
  const size_t threadCount =
      (max_subgroup_size == 0)
          ? 0
          : (size + max_subgroup_size - 1) / max_subgroup_size;

  LOG_INFO << "[Application] Problem size: " << size
           << ". Kernel maxSubGroupSize: " << kernel_properties.maxSubgroupSize
           << ". GPU thread count: ceil (P size/maxSubGroupSize) = "
           << threadCount;

  auto dest_buffer_d =
      lzt::allocate_device_memory(size, size, 0, 0, device, context);
  auto dest_buffer_s =
      lzt::allocate_shared_memory(size, size, 0, 0, device, context);
  auto src_buffer_d =
      lzt::allocate_device_memory(size, size, 0, 0, device, context);
  auto src_buffer_s =
      lzt::allocate_shared_memory(size, size, 0, 0, device, context);

  void *slm_output_s = lzt::allocate_shared_memory(
      slm_buffer_size, slm_buffer_size, 0, 0, device, context);

  unsigned long loop_max = 1000000000;

  auto loop_counter_d = lzt::allocate_device_memory(
      loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context);
  auto loop_counter_s = lzt::allocate_shared_memory(
      loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context);

  LOG_DEBUG << "[Application] Allocated source device memory at: " << std::hex
            << src_buffer_d;
  LOG_DEBUG << "[Application] Allocated destination device memory at: "
            << std::hex << dest_buffer_d;

  std::memset(dest_buffer_s, 1, size);
  std::memset(src_buffer_s, 0, size);
  std::memset(loop_counter_s, 0, loop_counter_alloc_size);
  // Fill the source with the pattern 1, 2, ..., 255, 0, 1, ...
  for (size_t i = 0; i < size; i++) {
    static_cast<uint8_t *>(src_buffer_s)[i] =
        static_cast<uint8_t>((i + 1) & 0xFF);
  }

  lzt::set_argument_value(kernel, 0, sizeof(dest_buffer_d), &dest_buffer_d);
  lzt::set_argument_value(kernel, 1, sizeof(src_buffer_d), &src_buffer_d);
  lzt::set_argument_value(kernel, 2, sizeof(loop_counter_d), &loop_counter_d);
  lzt::set_argument_value(kernel, 3, sizeof(loop_max), &loop_max);
  lzt::set_argument_value(kernel, 4, sizeof(slm_output_s), &slm_output_s);

  uint32_t group_size_x = 1;
  uint32_t group_size_y = 1;
  uint32_t group_size_z = 1;
  lzt::suggest_group_size(kernel, size, 1, 1, group_size_x, group_size_y,
                          group_size_z);
  lzt::set_group_size(kernel, group_size_x, 1, 1);
  ze_group_count_t group_count = {};
  group_count.groupCountX = static_cast<uint32_t>(size / group_size_x);
  group_count.groupCountY = 1;
  group_count.groupCountZ = 1;

  lzt::append_memory_copy(command_list, src_buffer_d, src_buffer_s, size);
  lzt::append_barrier(command_list);
  lzt::append_launch_function(command_list, kernel, &group_count, nullptr, 0,
                              nullptr);
  lzt::append_barrier(command_list);
  lzt::append_memory_copy(command_list, dest_buffer_s, dest_buffer_d, size);
  lzt::append_memory_copy(command_list, loop_counter_s, loop_counter_d,
                          loop_counter_alloc_size);
  lzt::close_command_list(command_list);

  LOG_DEBUG << "[Application] launching execution of " << kernel_name;

  // Publish the source buffer address before execution starts so the debugger
  // side can pick it up.
  synchro.update_gpu_buffer_address(reinterpret_cast<uint64_t>(src_buffer_d));
  synchro.notify_debugger();

  lzt::execute_command_lists(command_queue, 1, &command_list, nullptr);
  lzt::synchronize(command_queue, UINT64_MAX);

  // Element 0 is deliberately excluded from the comparison.
  // NOTE(review): presumably because the debugger overwrites the first byte
  // of the source buffer to end the kernel loop - confirm against the kernel.
  for (size_t i = 1; i < size; i++) {
    EXPECT_EQ(static_cast<uint8_t *>(dest_buffer_s)[i],
              static_cast<uint8_t *>(src_buffer_s)[i]);
    if (static_cast<uint8_t *>(dest_buffer_s)[i] !=
        static_cast<uint8_t *>(src_buffer_s)[i]) {
      LOG_ERROR << "[Application] Buffer Sanity check did not pass";
      break;
    }
  }

  // cleanup
  lzt::free_memory(context, dest_buffer_s);
  lzt::free_memory(context, dest_buffer_d);
  lzt::free_memory(context, src_buffer_s);
  lzt::free_memory(context, src_buffer_d);
  lzt::free_memory(context, loop_counter_s);
  lzt::free_memory(context, loop_counter_d);
  lzt::free_memory(context, slm_output_s);

  lzt::destroy_function(kernel);
  lzt::destroy_module(module);
  lzt::destroy_command_list(command_list);
  lzt::destroy_command_queue(command_queue);

  if (::testing::Test::HasFailure()) {
    exit(1);
  }
}

void run_multiple_threads(ze_context_handle_t context,
ze_device_handle_t device, process_synchro &synchro,
debug_options &options) {
Expand Down Expand Up @@ -1227,6 +1354,11 @@ int main(int argc, char **argv) {
options.kernel_name_in = "long_kernel_slm";
run_long_kernel(context, device, synchro, options);
break;
case LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH:
options.use_custom_module = true;
options.module_name_in = "debug_loop_slm.spv";
run_long_kernel_scratch(context, device, synchro, options);
break;
case MULTIPLE_THREADS:
run_multiple_threads(context, device, synchro, options);
break;
Expand Down

0 comments on commit 4697059

Please sign in to comment.