diff --git a/conformance_tests/tools/debug/src/test_debug.cpp b/conformance_tests/tools/debug/src/test_debug.cpp index 47335878..363a15d1 100644 --- a/conformance_tests/tools/debug/src/test_debug.cpp +++ b/conformance_tests/tools/debug/src/test_debug.cpp @@ -8,6 +8,7 @@ #include "test_debug.hpp" #include "test_debug_utils.hpp" +#include "test_harness/zet_intel_gpu_debug.h" namespace lzt = level_zero_tests; @@ -1376,6 +1377,128 @@ void zetDebugReadWriteRegistersTest::run_read_write_registers_test( } } +void zetDebugReadWriteRegistersTest::run_read_registers_test( + std::vector &devices, bool use_sub_devices) { + for (auto &device : devices) { + print_device(device); + if (!is_debug_supported(device)) + continue; + + synchro->clear_debugger_signal(); + debugHelper = launch_process(LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH, + device, use_sub_devices); + + zet_debug_event_t module_event; + attach_and_get_module_event(debugHelper.id(), synchro, device, debugSession, + module_event); + + if (module_event.flags & ZET_DEBUG_EVENT_FLAG_NEED_ACK) { + LOG_DEBUG << "[Debugger] Acking event: " + << lzt::debuggerEventTypeString[module_event.type]; + lzt::debug_ack_event(debugSession, &module_event); + } + + uint64_t gpu_buffer_va = 0; + synchro->wait_for_application_signal(); + if (!synchro->get_app_gpu_buffer_address(gpu_buffer_va)) { + FAIL() << "[Debugger] Could not get a valid GPU buffer VA"; + } + synchro->clear_application_signal(); + + zet_debug_memory_space_desc_t memorySpaceDesc; + memorySpaceDesc.type = ZET_DEBUG_MEMORY_SPACE_TYPE_DEFAULT; + int sizeToRead = 512; + uint8_t *kernel_buffer = new uint8_t[sizeToRead]; + // set buffer[0] to 0 to break the loop. See debug_loop_slm.cl + kernel_buffer[0] = 0; + memorySpaceDesc.address = gpu_buffer_va; + + ze_device_thread_t device_threads = {}; + device_threads.slice = UINT32_MAX; + device_threads.subslice = UINT32_MAX; + device_threads.eu = UINT32_MAX; + device_threads.thread = UINT32_MAX; + + LOG_INFO << "[Debugger] Stopping all device threads"; + // give time to app to launch the kernel + std::this_thread::sleep_for(std::chrono::seconds(6)); + lzt::debug_interrupt(debugSession, device_threads); + + std::vector stopped_threads; + if (!find_stopped_threads(debugSession, device, device_threads, true, + stopped_threads)) { + delete[] kernel_buffer; + FAIL() << "[Debugger] Did not find stopped threads"; + } + + LOG_INFO << "[Debugger] Reading/Writing Thread Scratch Register on " + "interrupted threads"; + + for (auto &stopped_thread : stopped_threads) { + std::vector register_set_properties = + lzt::get_register_set_properties(device); + if (lzt::is_heapless_mode(stopped_thread, device, debugSession)) { + for (auto ®ister_set : register_set_properties) { + if ((register_set.type == + ZET_DEBUG_REGSET_TYPE_THREAD_SCRATCH_INTEL_GPU) && + (register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_READABLE)) { + LOG_DEBUG << "[Debugger] Register set type " << register_set.type + << " is readable"; + size_t reg_size_in_bytes = + register_set.count * register_set.byteSize; + + uint64_t *thread_scratch_reg_values = + new uint64_t[reg_size_in_bytes]; + ASSERT_EQ(zetDebugReadRegisters( + debugSession, stopped_thread, + ZET_DEBUG_REGSET_TYPE_DEBUG_SCRATCH_INTEL_GPU, 0, + register_set.count, thread_scratch_reg_values), + ZE_RESULT_SUCCESS); + } else { + FAIL() << "[Debugger] Register set type " << register_set.type + << " is NOT readable"; + } + if (register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_WRITEABLE) { + FAIL() << "[Debugger] Register set type " << register_set.type + << " should NOT be Writable"; + } else { + LOG_INFO << "[Debugger] Register set " << register_set.type + << " type is NOT writeable"; + } + } + } else { + GTEST_SKIP() << "Test is not supported on this device"; + } + } + + lzt::debug_write_memory(debugSession, device_threads, memorySpaceDesc, 1, + kernel_buffer); + delete[] kernel_buffer; + + LOG_INFO << "[Debugger] resuming interrupted threads"; + lzt::debug_resume(debugSession, device_threads); + debugHelper.wait(); + + std::vector expectedEvents = { + ZET_DEBUG_EVENT_TYPE_MODULE_UNLOAD, ZET_DEBUG_EVENT_TYPE_PROCESS_EXIT}; + + if (!check_events(debugSession, expectedEvents)) { + FAIL() << "[Debugger] Did not receive expected events"; + } + + lzt::debug_detach(debugSession); + ASSERT_EQ(debugHelper.exit_code(), 0); + } +} + +TEST_F( + zetDebugReadWriteRegistersTest, + GivenActiveDebugSessionWhenReadingScratchRegistersThenDataReadIsDoneSuccessfully) { + auto driver = lzt::get_default_driver(); + auto devices = lzt::get_devices(driver); + run_read_registers_test(devices, false); +} + TEST_F( zetDebugReadWriteRegistersTest, GivenActiveDebugSessionWhenReadingAndWritingRegistersThenValidDataReadAndDataWrittenSuccessfully) { diff --git a/conformance_tests/tools/debug/src/test_debug.hpp b/conformance_tests/tools/debug/src/test_debug.hpp index c3ad19ec..b5c80491 100644 --- a/conformance_tests/tools/debug/src/test_debug.hpp +++ b/conformance_tests/tools/debug/src/test_debug.hpp @@ -255,6 +255,8 @@ class zetDebugReadWriteRegistersTest : public zetDebugMemAccessTest { void TearDown() override { zetDebugMemAccessTest::TearDown(); } void run_read_write_registers_test(std::vector &devices, bool use_sub_devices); + void run_read_registers_test(std::vector &devices, + bool use_sub_devices); }; class zetDebugThreadControlTest : public zetDebugBaseSetup { diff --git a/conformance_tests/tools/debug/src/test_debug_common.hpp b/conformance_tests/tools/debug/src/test_debug_common.hpp index 52c0aee3..124a83b2 100644 --- a/conformance_tests/tools/debug/src/test_debug_common.hpp +++ b/conformance_tests/tools/debug/src/test_debug_common.hpp @@ -48,6 +48,7 @@ typedef enum { ATTACH_AFTER_MODULE_DESTROYED, LONG_RUNNING_KERNEL_INTERRUPTED, LONG_RUNNING_KERNEL_INTERRUPTED_SLM, + LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH, PAGE_FAULT, MULTIPLE_THREADS, MULTIPLE_CQ, diff --git a/conformance_tests/tools/debug/src/test_debug_helper.cpp b/conformance_tests/tools/debug/src/test_debug_helper.cpp index c1dbc117..b565ce28 100644 --- a/conformance_tests/tools/debug/src/test_debug_helper.cpp +++ b/conformance_tests/tools/debug/src/test_debug_helper.cpp @@ -583,6 +583,133 @@ void run_long_kernel(ze_context_handle_t context, ze_device_handle_t device, } } +void run_long_kernel_scratch(ze_context_handle_t context, + ze_device_handle_t device, + process_synchro &synchro, debug_options &options) { + + auto command_list = lzt::create_command_list(device); + auto command_queue = lzt::create_command_queue(device); + std::string module_name = options.module_name_in; + + std::string kernel_name = "long_kernel_slm"; + size_t slm_buffer_size = 512; // NOTE: Not all SKUs have same SLM so can go too big. + + synchro.wait_for_debugger_signal(); + const char *build_flags ="-g -igc_opts 'VISAOptions=-forcespills'"; + auto module = + lzt::create_module(device, module_name, ZE_MODULE_FORMAT_IL_SPIRV, + build_flags /* include debug symbols*/, nullptr); + + auto kernel = lzt::create_function(module, kernel_name); + auto size = slm_buffer_size; + + ze_kernel_properties_t kernel_properties = { + ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES, nullptr}; + EXPECT_EQ(ZE_RESULT_SUCCESS, + zeKernelGetProperties(kernel, &kernel_properties)); + int threadCount = std::ceil(size / kernel_properties.maxSubgroupSize); + + LOG_INFO << "[Application] Problem size: " << size + << ". Kernel maxSubGroupSize: " << kernel_properties.maxSubgroupSize + << ". GPU thread count: ceil (P size/maxSubGroupSize) = " + << threadCount; + + auto dest_buffer_d = + lzt::allocate_device_memory(size, size, 0, 0, device, context); + auto dest_buffer_s = + lzt::allocate_shared_memory(size, size, 0, 0, device, context); + auto src_buffer_d = + lzt::allocate_device_memory(size, size, 0, 0, device, context); + auto src_buffer_s = + lzt::allocate_shared_memory(size, size, 0, 0, device, context); + + void *slm_output_s = nullptr; + slm_output_s = lzt::allocate_shared_memory(slm_buffer_size, slm_buffer_size, + 0, 0, device, context); + + unsigned long loop_max = 1000000000; + + auto loop_counter_d = lzt::allocate_device_memory( + loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context); + auto loop_counter_s = lzt::allocate_shared_memory( + loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context); + + LOG_DEBUG << "[Application] Allocated source device memory at: " << std::hex + << src_buffer_d; + LOG_DEBUG << "[Application] Allocated destination device memory at: " + << std::hex << dest_buffer_d; + + std::memset(dest_buffer_s, 1, size); + std::memset(src_buffer_s, 0, size); + std::memset(loop_counter_s, 0, loop_counter_alloc_size); + for (size_t i = 0; i < size; i++) { + static_cast(src_buffer_s)[i] = (i + 1 & 0xFF); + } + + lzt::set_argument_value(kernel, 0, sizeof(dest_buffer_d), &dest_buffer_d); + lzt::set_argument_value(kernel, 1, sizeof(src_buffer_d), &src_buffer_d); + lzt::set_argument_value(kernel, 2, sizeof(loop_counter_d), &loop_counter_d); + lzt::set_argument_value(kernel, 3, sizeof(loop_max), &loop_max); + lzt::set_argument_value(kernel, 4, sizeof(slm_output_s), &slm_output_s); + + uint32_t group_size_x = 1; + uint32_t group_size_y = 1; + uint32_t group_size_z = 1; + lzt::suggest_group_size(kernel, size, 1, 1, group_size_x, group_size_y, + group_size_z); + lzt::set_group_size(kernel, group_size_x, 1, 1); + ze_group_count_t group_count = {}; + group_count.groupCountX = size / group_size_x; + group_count.groupCountY = 1; + group_count.groupCountZ = 1; + + lzt::append_memory_copy(command_list, src_buffer_d, src_buffer_s, size); + lzt::append_barrier(command_list); + lzt::append_launch_function(command_list, kernel, &group_count, nullptr, 0, + nullptr); + lzt::append_barrier(command_list); + lzt::append_memory_copy(command_list, dest_buffer_s, dest_buffer_d, size); + lzt::append_memory_copy(command_list, loop_counter_s, loop_counter_d, + loop_counter_alloc_size); + lzt::close_command_list(command_list); + + LOG_DEBUG << "[Application] launching execution of " << kernel_name; + + synchro.update_gpu_buffer_address(reinterpret_cast(src_buffer_d)); + synchro.notify_debugger(); + + lzt::execute_command_lists(command_queue, 1, &command_list, nullptr); + lzt::synchronize(command_queue, UINT64_MAX); + + for (size_t i = 1; i < size; i++) { + EXPECT_EQ(static_cast(dest_buffer_s)[i], + static_cast(src_buffer_s)[i]); + if (static_cast(dest_buffer_s)[i] != + static_cast(src_buffer_s)[i]) { + LOG_ERROR << "[Application] Buffer Sanity check did not pass"; + break; + } + } + + // cleanup + lzt::free_memory(context, dest_buffer_s); + lzt::free_memory(context, dest_buffer_d); + lzt::free_memory(context, src_buffer_s); + lzt::free_memory(context, src_buffer_d); + lzt::free_memory(context, loop_counter_s); + lzt::free_memory(context, loop_counter_d); + lzt::free_memory(context, slm_output_s); + + lzt::destroy_function(kernel); + lzt::destroy_module(module); + lzt::destroy_command_list(command_list); + lzt::destroy_command_queue(command_queue); + + if (::testing::Test::HasFailure()) { + exit(1); + } +} + void run_multiple_threads(ze_context_handle_t context, ze_device_handle_t device, process_synchro &synchro, debug_options &options) { @@ -1227,6 +1354,11 @@ int main(int argc, char **argv) { options.kernel_name_in = "long_kernel_slm"; run_long_kernel(context, device, synchro, options); break; + case LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH: + options.use_custom_module = true; + options.module_name_in = "debug_loop_slm.spv"; + run_long_kernel_scratch(context, device, synchro, options); + break; case MULTIPLE_THREADS: run_multiple_threads(context, device, synchro, options); break; diff --git a/utils/test_harness/tools/include/test_harness_debug.hpp b/utils/test_harness/tools/include/test_harness_debug.hpp index 483bcb53..9f3941a0 100644 --- a/utils/test_harness/tools/include/test_harness_debug.hpp +++ b/utils/test_harness/tools/include/test_harness_debug.hpp @@ -43,8 +43,8 @@ void debug_resume(const zet_debug_session_handle_t &debug_session, const ze_device_thread_t &device_thread); void clear_exceptions(const ze_device_handle_t &device, - const zet_debug_session_handle_t &debug_session, - const ze_device_thread_t &device_thread); + const zet_debug_session_handle_t &debug_session, + const ze_device_thread_t &device_thread); void debug_read_memory(const zet_debug_session_handle_t &debug_session, const ze_device_thread_t &device_thread, @@ -79,6 +79,10 @@ void debug_write_registers(const zet_debug_session_handle_t &debug_session, std::vector get_debug_info(const zet_module_handle_t &module); +bool is_heapless_mode(ze_device_thread_t stopped_thread, + ze_device_handle_t &device_handle, + zet_debug_session_handle_t debug_session); + }; // namespace level_zero_tests #endif /* TEST_HARNESS_DEBUG_HPP */ diff --git a/utils/test_harness/tools/src/test_harness_debug.cpp b/utils/test_harness/tools/src/test_harness_debug.cpp index ed1838b8..0fe87fc0 100644 --- a/utils/test_harness/tools/src/test_harness_debug.cpp +++ b/utils/test_harness/tools/src/test_harness_debug.cpp @@ -181,8 +181,8 @@ bool get_register_set_props(ze_device_handle_t device, } void clear_exceptions(const ze_device_handle_t &device, - const zet_debug_session_handle_t &debug_session, - const ze_device_thread_t &device_thread) { + const zet_debug_session_handle_t &debug_session, + const ze_device_thread_t &device_thread) { size_t reg_size_in_bytes = 0; zet_debug_regset_properties_t cr_reg_prop; @@ -198,8 +198,7 @@ void clear_exceptions(const ze_device_handle_t &device, cr_reg_prop.count, cr_values), ZE_RESULT_SUCCESS); - uint32_values[1] &= - ~((1 << 26) | (1 << 30)); + uint32_values[1] &= ~((1 << 26) | (1 << 30)); ASSERT_EQ(zetDebugWriteRegisters(debug_session, device_thread, ZET_DEBUG_REGSET_TYPE_CR_INTEL_GPU, 0, cr_reg_prop.count, cr_values), @@ -305,4 +304,31 @@ std::vector get_debug_info(const zet_module_handle_t &module_handle) { return debug_info; } +bool is_heapless_mode(ze_device_thread_t stopped_thread, + ze_device_handle_t &device_handle, + zet_debug_session_handle_t debug_session) { + + uint8_t *mode_values = nullptr; + bool result = false; + std::vector regset_properties = + lzt::get_register_set_properties(device_handle); + for (auto ®ister_set : regset_properties) { + if (register_set.type == ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU) { + auto reg_size_in_bytes = register_set.count * register_set.byteSize; + mode_values = new uint8_t[reg_size_in_bytes]; + EXPECT_EQ( + zetDebugReadRegisters(debug_session, stopped_thread, + ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU, 0, + register_set.count, mode_values), + ZE_RESULT_SUCCESS); + + uint32_t *uint32_t_values = (uint32_t *)mode_values; + LOG_DEBUG << "[Debugger] mode value: %u " << uint32_t_values[0]; + result = (uint32_t_values[0] & ZET_DEBUG_MODE_FLAG_HEAPLESS); + } + } + + return result; +} + } // namespace level_zero_tests