Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CTS for scratch register reading. #89

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions conformance_tests/tools/debug/src/test_debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "test_debug.hpp"
#include "test_debug_utils.hpp"
#include "test_harness/zet_intel_gpu_debug.h"

namespace lzt = level_zero_tests;

Expand Down Expand Up @@ -1376,6 +1377,128 @@ void zetDebugReadWriteRegistersTest::run_read_write_registers_test(
}
}

void zetDebugReadWriteRegistersTest::run_read_registers_test(
std::vector<ze_device_handle_t> &devices, bool use_sub_devices) {
for (auto &device : devices) {
print_device(device);
if (!is_debug_supported(device))
continue;

synchro->clear_debugger_signal();
debugHelper = launch_process(LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH,
device, use_sub_devices);

zet_debug_event_t module_event;
attach_and_get_module_event(debugHelper.id(), synchro, device, debugSession,
module_event);

if (module_event.flags & ZET_DEBUG_EVENT_FLAG_NEED_ACK) {
LOG_DEBUG << "[Debugger] Acking event: "
<< lzt::debuggerEventTypeString[module_event.type];
lzt::debug_ack_event(debugSession, &module_event);
}

uint64_t gpu_buffer_va = 0;
synchro->wait_for_application_signal();
if (!synchro->get_app_gpu_buffer_address(gpu_buffer_va)) {
FAIL() << "[Debugger] Could not get a valid GPU buffer VA";
}
synchro->clear_application_signal();

zet_debug_memory_space_desc_t memorySpaceDesc;
memorySpaceDesc.type = ZET_DEBUG_MEMORY_SPACE_TYPE_DEFAULT;
int sizeToRead = 512;
uint8_t *kernel_buffer = new uint8_t[sizeToRead];
// set buffer[0] to 0 to break the loop. See debug_loop_slm.cl
kernel_buffer[0] = 0;
memorySpaceDesc.address = gpu_buffer_va;

ze_device_thread_t device_threads = {};
device_threads.slice = UINT32_MAX;
device_threads.subslice = UINT32_MAX;
device_threads.eu = UINT32_MAX;
device_threads.thread = UINT32_MAX;

LOG_INFO << "[Debugger] Stopping all device threads";
// give time to app to launch the kernel
std::this_thread::sleep_for(std::chrono::seconds(6));
lzt::debug_interrupt(debugSession, device_threads);

std::vector<ze_device_thread_t> stopped_threads;
if (!find_stopped_threads(debugSession, device, device_threads, true,
stopped_threads)) {
delete[] kernel_buffer;
FAIL() << "[Debugger] Did not find stopped threads";
}

LOG_INFO << "[Debugger] Reading/Writing Thread Scratch Register on "
"interrupted threads";

for (auto &stopped_thread : stopped_threads) {
std::vector<zet_debug_regset_properties_t> register_set_properties =
lzt::get_register_set_properties(device);
if (lzt::is_heapless_mode(stopped_thread, device, debugSession)) {
for (auto &register_set : register_set_properties) {
if ((register_set.type ==
ZET_DEBUG_REGSET_TYPE_THREAD_SCRATCH_INTEL_GPU) &&
(register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_READABLE)) {
LOG_DEBUG << "[Debugger] Register set type " << register_set.type
<< " is readable";
size_t reg_size_in_bytes =
register_set.count * register_set.byteSize;

uint64_t *thread_scratch_reg_values =
new uint64_t[reg_size_in_bytes];
ASSERT_EQ(zetDebugReadRegisters(
debugSession, stopped_thread,
ZET_DEBUG_REGSET_TYPE_DEBUG_SCRATCH_INTEL_GPU, 0,
register_set.count, thread_scratch_reg_values),
ZE_RESULT_SUCCESS);
} else {
FAIL() << "[Debugger] Register set type " << register_set.type
<< " is NOT readable";
}
if (register_set.generalFlags & ZET_DEBUG_REGSET_FLAG_WRITEABLE) {
aviralni marked this conversation as resolved.
Show resolved Hide resolved
FAIL() << "[Debugger] Register set type " << register_set.type
<< " should NOT be Writable";
} else {
LOG_INFO << "[Debugger] Register set " << register_set.type
<< " type is NOT writeable";
}
}
} else {
GTEST_SKIP() << "Test is not supported on this device";
}
}

lzt::debug_write_memory(debugSession, device_threads, memorySpaceDesc, 1,
kernel_buffer);
delete[] kernel_buffer;

LOG_INFO << "[Debugger] resuming interrupted threads";
lzt::debug_resume(debugSession, device_threads);
debugHelper.wait();

std::vector<zet_debug_event_type_t> expectedEvents = {
ZET_DEBUG_EVENT_TYPE_MODULE_UNLOAD, ZET_DEBUG_EVENT_TYPE_PROCESS_EXIT};

if (!check_events(debugSession, expectedEvents)) {
FAIL() << "[Debugger] Did not receive expected events";
}

lzt::debug_detach(debugSession);
ASSERT_EQ(debugHelper.exit_code(), 0);
}
}

TEST_F(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: why can't we use the same test case that @bmyates mentioned in our sync?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we might need to modify the kernel too. The kernel does not allocate any local memory as such.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any difficulty in modifying the kernel? I believe scratch is forced using IGC_OPTs, so with these options a kernel modification might not be required.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes please try to use the same test. The kernel should not need to allocate any local memory. Local memory is for SLM - scratch is different. The compiler will determine if scratch is needed or not based on register usage.

Try to use the same kernel. If scratch isn't allocated, then we can try to use an IGC option. If we are still not getting any spills, I can give some suggestions for modifying the kernel.

zetDebugReadWriteRegistersTest,
GivenActiveDebugSessionWhenReadingScratchRegistersThenDataReadIsDoneSuccessfully) {
auto driver = lzt::get_default_driver();
auto devices = lzt::get_devices(driver);
run_read_registers_test(devices, false);
}

TEST_F(
zetDebugReadWriteRegistersTest,
GivenActiveDebugSessionWhenReadingAndWritingRegistersThenValidDataReadAndDataWrittenSuccessfully) {
Expand Down
2 changes: 2 additions & 0 deletions conformance_tests/tools/debug/src/test_debug.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ class zetDebugReadWriteRegistersTest : public zetDebugMemAccessTest {
void TearDown() override { zetDebugMemAccessTest::TearDown(); }
void run_read_write_registers_test(std::vector<ze_device_handle_t> &devices,
bool use_sub_devices);
// For each debug-capable device: launches the
// LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH helper, interrupts its threads and
// reads the thread scratch register set on the stopped threads.
// `use_sub_devices` is forwarded to launch_process().
void run_read_registers_test(std::vector<ze_device_handle_t> &devices,
bool use_sub_devices);
};

class zetDebugThreadControlTest : public zetDebugBaseSetup {
Expand Down
1 change: 1 addition & 0 deletions conformance_tests/tools/debug/src/test_debug_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ typedef enum {
ATTACH_AFTER_MODULE_DESTROYED,
LONG_RUNNING_KERNEL_INTERRUPTED,
LONG_RUNNING_KERNEL_INTERRUPTED_SLM,
LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH,
PAGE_FAULT,
MULTIPLE_THREADS,
MULTIPLE_CQ,
Expand Down
132 changes: 132 additions & 0 deletions conformance_tests/tools/debug/src/test_debug_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,133 @@ void run_long_kernel(ze_context_handle_t context, ze_device_handle_t device,
}
}

// Application-side helper that runs the "long_kernel_slm" kernel from the
// module named in `options.module_name_in`, built with
// "-igc_opts 'VISAOptions=-forcespills'" so that the compiler is asked to
// spill registers.
//
// Sequence:
//   - waits for the debugger's signal before creating the module,
//   - allocates source/destination/loop-counter/SLM-output buffers and fills
//     the source buffer with the pattern (i + 1) & 0xFF,
//   - publishes the device address of the source buffer to the debugger via
//     `synchro` and notifies it,
//   - executes copy -> kernel -> copy-back and waits for completion,
//   - compares destination and source from index 1 onwards (index 0 is
//     skipped by the check),
//   - releases all resources and calls exit(1) if any gtest expectation
//     failed.
//
// context  context used for all allocations.
// device   device the kernel is launched on.
// synchro  cross-process synchronisation with the debugger.
// options  only `module_name_in` is read here.
void run_long_kernel_scratch(ze_context_handle_t context,
                             ze_device_handle_t device,
                             process_synchro &synchro, debug_options &options) {

  auto command_list = lzt::create_command_list(device);
  auto command_queue = lzt::create_command_queue(device);
  std::string module_name = options.module_name_in;

  std::string kernel_name = "long_kernel_slm";
  // NOTE: Not all SKUs have the same amount of SLM, so this must stay small.
  size_t slm_buffer_size = 512;

  synchro.wait_for_debugger_signal();
  // -g: include debug symbols; the IGC option asks the compiler to force
  // register spills.
  const char *build_flags = "-g -igc_opts 'VISAOptions=-forcespills'";
  auto module =
      lzt::create_module(device, module_name, ZE_MODULE_FORMAT_IL_SPIRV,
                         build_flags, nullptr);

  auto kernel = lzt::create_function(module, kernel_name);
  auto size = slm_buffer_size;

  ze_kernel_properties_t kernel_properties = {
      ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES, nullptr};
  EXPECT_EQ(ZE_RESULT_SUCCESS,
            zeKernelGetProperties(kernel, &kernel_properties));

  // Integer ceiling division. The previous std::ceil(size / maxSubgroupSize)
  // truncated before rounding and divided by zero when the property query
  // left maxSubgroupSize at 0.
  const size_t max_subgroup_size = kernel_properties.maxSubgroupSize;
  const size_t threadCount =
      (max_subgroup_size == 0)
          ? 0
          : (size + max_subgroup_size - 1) / max_subgroup_size;

  LOG_INFO << "[Application] Problem size: " << size
           << ". Kernel maxSubGroupSize: " << kernel_properties.maxSubgroupSize
           << ". GPU thread count: ceil (P size/maxSubGroupSize) = "
           << threadCount;

  auto dest_buffer_d =
      lzt::allocate_device_memory(size, size, 0, 0, device, context);
  auto dest_buffer_s =
      lzt::allocate_shared_memory(size, size, 0, 0, device, context);
  auto src_buffer_d =
      lzt::allocate_device_memory(size, size, 0, 0, device, context);
  auto src_buffer_s =
      lzt::allocate_shared_memory(size, size, 0, 0, device, context);

  void *slm_output_s = lzt::allocate_shared_memory(
      slm_buffer_size, slm_buffer_size, 0, 0, device, context);

  // NOTE(review): `unsigned long` is 32-bit on some platforms; confirm the
  // kernel's fourth argument has the same width there.
  unsigned long loop_max = 1000000000;

  auto loop_counter_d = lzt::allocate_device_memory(
      loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context);
  auto loop_counter_s = lzt::allocate_shared_memory(
      loop_counter_alloc_size, loop_counter_alloc_size, 0, 0, device, context);

  LOG_DEBUG << "[Application] Allocated source device memory at: " << std::hex
            << src_buffer_d;
  LOG_DEBUG << "[Application] Allocated destination device memory at: "
            << std::hex << dest_buffer_d;

  std::memset(dest_buffer_s, 1, size);
  std::memset(src_buffer_s, 0, size);
  std::memset(loop_counter_s, 0, loop_counter_alloc_size);
  auto *src_bytes = static_cast<uint8_t *>(src_buffer_s);
  for (size_t i = 0; i < size; i++) {
    src_bytes[i] = static_cast<uint8_t>((i + 1) & 0xFF);
  }

  lzt::set_argument_value(kernel, 0, sizeof(dest_buffer_d), &dest_buffer_d);
  lzt::set_argument_value(kernel, 1, sizeof(src_buffer_d), &src_buffer_d);
  lzt::set_argument_value(kernel, 2, sizeof(loop_counter_d), &loop_counter_d);
  lzt::set_argument_value(kernel, 3, sizeof(loop_max), &loop_max);
  lzt::set_argument_value(kernel, 4, sizeof(slm_output_s), &slm_output_s);

  uint32_t group_size_x = 1;
  uint32_t group_size_y = 1;
  uint32_t group_size_z = 1;
  lzt::suggest_group_size(kernel, size, 1, 1, group_size_x, group_size_y,
                          group_size_z);
  lzt::set_group_size(kernel, group_size_x, 1, 1);
  ze_group_count_t group_count = {};
  group_count.groupCountX = size / group_size_x;
  group_count.groupCountY = 1;
  group_count.groupCountZ = 1;

  lzt::append_memory_copy(command_list, src_buffer_d, src_buffer_s, size);
  lzt::append_barrier(command_list);
  lzt::append_launch_function(command_list, kernel, &group_count, nullptr, 0,
                              nullptr);
  lzt::append_barrier(command_list);
  lzt::append_memory_copy(command_list, dest_buffer_s, dest_buffer_d, size);
  lzt::append_memory_copy(command_list, loop_counter_s, loop_counter_d,
                          loop_counter_alloc_size);
  lzt::close_command_list(command_list);

  LOG_DEBUG << "[Application] launching execution of " << kernel_name;

  // Hand the device address of the source buffer to the debugger before the
  // kernel starts running.
  synchro.update_gpu_buffer_address(reinterpret_cast<uint64_t>(src_buffer_d));
  synchro.notify_debugger();

  lzt::execute_command_lists(command_queue, 1, &command_list, nullptr);
  lzt::synchronize(command_queue, UINT64_MAX);

  // Sanity check of the copy; index 0 is intentionally not compared.
  const auto *dest_bytes = static_cast<uint8_t *>(dest_buffer_s);
  for (size_t i = 1; i < size; i++) {
    EXPECT_EQ(dest_bytes[i], src_bytes[i]);
    if (dest_bytes[i] != src_bytes[i]) {
      LOG_ERROR << "[Application] Buffer Sanity check did not pass";
      break;
    }
  }

  // cleanup
  lzt::free_memory(context, dest_buffer_s);
  lzt::free_memory(context, dest_buffer_d);
  lzt::free_memory(context, src_buffer_s);
  lzt::free_memory(context, src_buffer_d);
  lzt::free_memory(context, loop_counter_s);
  lzt::free_memory(context, loop_counter_d);
  lzt::free_memory(context, slm_output_s);

  lzt::destroy_function(kernel);
  lzt::destroy_module(module);
  lzt::destroy_command_list(command_list);
  lzt::destroy_command_queue(command_queue);

  if (::testing::Test::HasFailure()) {
    exit(1);
  }
}

void run_multiple_threads(ze_context_handle_t context,
ze_device_handle_t device, process_synchro &synchro,
debug_options &options) {
Expand Down Expand Up @@ -1227,6 +1354,11 @@ int main(int argc, char **argv) {
options.kernel_name_in = "long_kernel_slm";
run_long_kernel(context, device, synchro, options);
break;
case LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH:
options.use_custom_module = true;
options.module_name_in = "debug_loop_slm.spv";
run_long_kernel_scratch(context, device, synchro, options);
break;
case MULTIPLE_THREADS:
run_multiple_threads(context, device, synchro, options);
break;
Expand Down
8 changes: 6 additions & 2 deletions utils/test_harness/tools/include/test_harness_debug.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ void debug_resume(const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread);

void clear_exceptions(const ze_device_handle_t &device,
const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread);
const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread);

void debug_read_memory(const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread,
Expand Down Expand Up @@ -79,6 +79,10 @@ void debug_write_registers(const zet_debug_session_handle_t &debug_session,

std::vector<uint8_t> get_debug_info(const zet_module_handle_t &module);

// Reads the ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU register set of
// `stopped_thread` through `debug_session` and returns true when
// ZET_DEBUG_MODE_FLAG_HEAPLESS is set in its first 32-bit value. Returns false
// when `device_handle` reports no such register set.
bool is_heapless_mode(ze_device_thread_t stopped_thread,
ze_device_handle_t &device_handle,
zet_debug_session_handle_t debug_session);

}; // namespace level_zero_tests

#endif /* TEST_HARNESS_DEBUG_HPP */
34 changes: 30 additions & 4 deletions utils/test_harness/tools/src/test_harness_debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ bool get_register_set_props(ze_device_handle_t device,
}

void clear_exceptions(const ze_device_handle_t &device,
const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread) {
const zet_debug_session_handle_t &debug_session,
const ze_device_thread_t &device_thread) {
size_t reg_size_in_bytes = 0;

zet_debug_regset_properties_t cr_reg_prop;
Expand All @@ -198,8 +198,7 @@ void clear_exceptions(const ze_device_handle_t &device,
cr_reg_prop.count, cr_values),
ZE_RESULT_SUCCESS);

uint32_values[1] &=
~((1 << 26) | (1 << 30));
uint32_values[1] &= ~((1 << 26) | (1 << 30));
ASSERT_EQ(zetDebugWriteRegisters(debug_session, device_thread,
ZET_DEBUG_REGSET_TYPE_CR_INTEL_GPU, 0,
cr_reg_prop.count, cr_values),
Expand Down Expand Up @@ -305,4 +304,31 @@ std::vector<uint8_t> get_debug_info(const zet_module_handle_t &module_handle) {
return debug_info;
}

// Reports whether `stopped_thread` is running in heapless mode.
//
// Looks up the ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU register set among
// the register sets reported for `device_handle`, reads it for the given
// thread through `debug_session`, and tests ZET_DEBUG_MODE_FLAG_HEAPLESS in
// the first 32-bit value.
//
// Returns false when the device reports no (or an empty) mode-flags register
// set. A failed register read is recorded as a non-fatal gtest failure and
// also yields false, because the buffer is zero-initialised.
bool is_heapless_mode(ze_device_thread_t stopped_thread,
                      ze_device_handle_t &device_handle,
                      zet_debug_session_handle_t debug_session) {

  const std::vector<zet_debug_regset_properties_t> regset_properties =
      lzt::get_register_set_properties(device_handle);

  for (const auto &register_set : regset_properties) {
    if (register_set.type != ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU) {
      continue;
    }

    const size_t reg_size_in_bytes =
        static_cast<size_t>(register_set.count) * register_set.byteSize;
    if (reg_size_in_bytes == 0) {
      continue;
    }

    // Backed by uint32_t words (rounded up) so the first flag word can be
    // read without a pointer cast, and owned by a vector so nothing leaks.
    const size_t word_count =
        (reg_size_in_bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t);
    std::vector<uint32_t> mode_values(word_count, 0);

    EXPECT_EQ(
        zetDebugReadRegisters(debug_session, stopped_thread,
                              ZET_DEBUG_REGSET_TYPE_MODE_FLAGS_INTEL_GPU, 0,
                              register_set.count, mode_values.data()),
        ZE_RESULT_SUCCESS);

    LOG_DEBUG << "[Debugger] mode value: " << mode_values[0];
    return (mode_values[0] & ZET_DEBUG_MODE_FLAG_HEAPLESS) != 0;
  }

  return false;
}

} // namespace level_zero_tests
Loading