diff --git a/.gitignore b/.gitignore index dc233fa..6b5aa69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,10 @@ **/build **/__pycache__ +build*/ +venv/ +.vscode/ +.vim/ +.cache/ +compile_commands.json +CMakeUserPresets.json +error_diff.txt diff --git a/VERSION b/VERSION index d690236..bfbc8c5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.49.19 +0.49.20 diff --git a/samples/dpc_gemm/CMakeLists.txt b/samples/dpc_gemm/CMakeLists.txt index 55e3514..9cd26a7 100644 --- a/samples/dpc_gemm/CMakeLists.txt +++ b/samples/dpc_gemm/CMakeLists.txt @@ -6,19 +6,23 @@ if(WIN32) set(CMAKE_CXX_COMPILER "dpcpp-cl.exe") set(CMAKE_GENERATOR_TOOLSET "Intel(R) oneAPI DPC++ Compiler") else() - set(CMAKE_CXX_COMPILER "dpcpp") + set(CMAKE_CXX_COMPILER "icpx") endif() project(PTI_Samples_DPC_GEMM CXX) SetCompilerFlags() SetBuildType() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -gline-tables-only") - add_executable(dpc_gemm main.cc) + +target_compile_options(dpc_gemm PUBLIC -fsycl -gline-tables-only) + +# target_link_options CMake >= 3.13 +set_target_properties(dpc_gemm PROPERTIES LINK_FLAGS "-fsycl -gline-tables-only") + target_include_directories(dpc_gemm PRIVATE "${PROJECT_SOURCE_DIR}/../../utils") if(CMAKE_INCLUDE_PATH) target_include_directories(dpc_gemm PUBLIC "${CMAKE_INCLUDE_PATH}") -endif() \ No newline at end of file +endif() diff --git a/samples/dpc_gemm/main.cc b/samples/dpc_gemm/main.cc index ea2c857..ef1feac 100644 --- a/samples/dpc_gemm/main.cc +++ b/samples/dpc_gemm/main.cc @@ -10,7 +10,7 @@ #include "utils.h" -#include +#include #define A_VALUE 0.128f #define B_VALUE 0.256f @@ -63,9 +63,12 @@ static float RunAndCheck(sycl::queue queue, cgh.parallel_for(sycl::range<2>(size, size), [=](sycl::id<2> id) { - GEMM(a_acc.get_pointer(), - b_acc.get_pointer(), - c_acc.get_pointer(), + auto a_acc_ptr = a_acc.get_multi_ptr(); + auto b_acc_ptr = b_acc.get_multi_ptr(); + auto c_acc_ptr = c_acc.get_multi_ptr(); + GEMM(a_acc_ptr.get(), + b_acc_ptr.get(), + c_acc_ptr.get(), size, id); }); }); @@ -101,11 +104,21 @@ static void Compute(sycl::queue queue, } int main(int argc, char* argv[]) { - sycl::info::device_type device_type = sycl::info::device_type::gpu; - if (argc > 1 && strcmp(argv[1], "cpu") == 0) { - device_type = sycl::info::device_type::cpu; - } else if (argc > 1 && strcmp(argv[1], "host") == 0) { - device_type = sycl::info::device_type::host; + sycl::device dev; + try { + dev = sycl::device(sycl::gpu_selector_v); + if (argc > 1 && strcmp(argv[1], "cpu") == 0) { + dev = sycl::device(sycl::cpu_selector_v); + } else if (argc > 1 && strcmp(argv[1], "host") == 0) { + dev = sycl::device(sycl::default_selector_v); + } + } catch (const sycl::exception& e) { + std::cerr << "Error: Exception caught while executing SYCL " << e.what() << '\n'; + std::cerr << "Unable to select valid sycl device" << '\n'; + return EXIT_FAILURE; + } catch (...) { + std::cerr << "Unable to select valid sycl device" << '\n'; + return EXIT_FAILURE; } unsigned size = 1024; @@ -118,17 +131,8 @@ int main(int argc, char* argv[]) { repeat_count = std::stoul(argv[3]); } - std::unique_ptr selector(nullptr); - if (device_type == sycl::info::device_type::cpu) { - selector.reset(new sycl::cpu_selector); - } else if (device_type == sycl::info::device_type::gpu) { - selector.reset(new sycl::gpu_selector); - } else if (device_type == sycl::info::device_type::host) { - selector.reset(new sycl::host_selector); - } - sycl::property_list prop_list{sycl::property::queue::enable_profiling()}; - sycl::queue queue(*selector.get(), sycl::async_handler{}, prop_list); + sycl::queue queue(dev, sycl::async_handler{}, prop_list); std::cout << "DPC++ Matrix Multiplication (matrix size: " << size << " x " << size << ", repeats " << repeat_count << " times)" << std::endl; @@ -148,4 +152,4 @@ int main(int argc, char* argv[]) { std::cout << "Total execution time: " << time.count() << " sec" << std::endl; return 0; -} \ No newline at end of file +} diff --git a/samples/dpc_info/main.cc b/samples/dpc_info/main.cc index 2710595..787a779 100644 --- a/samples/dpc_info/main.cc +++ b/samples/dpc_info/main.cc @@ -166,7 +166,7 @@ std::ostream& operator<<(std::ostream& out, sycl::aspect sycl_aspect) { break; #endif default: - out << "(sycl_aspect) << ">"; + out << "(sycl_aspect) << ">"; break; } return out; diff --git a/sdk/samples/dlworkloads/main.cpp b/sdk/samples/dlworkloads/main.cpp index e987cf5..42c908e 100644 --- a/sdk/samples/dlworkloads/main.cpp +++ b/sdk/samples/dlworkloads/main.cpp @@ -21,8 +21,7 @@ void PrintUsage() std::cout << "It is a largely simpilified application to demo mixed programming "; std::cout << "on Intel GPU for deep learning (PyTorch&TensorFlow) workloads (ITEX&IPEX) "; std::cout << "with direct dpcpp kernel, onednn, onemkl, onedpl, onemkl, eigen, etc." << std::endl; - std::cout << "IPEX: https://github.com/intel-innersource/frameworks.ai.pytorch.ipex-gpu" << std::endl; - std::cout << "ITEX: https://github.com/intel-innersource/frameworks.ai.infrastructure.intel-extension-for-tensorflow.intel-extension-for-tensorflow" << std::endl; + std::cout << "IPEX: https://github.com/intel/intel-extension-for-pytorch" << std::endl; std::cout << std::endl; std::cout << "The purpose of this application is to provide a basic rough requirement for sycl graph capture mode." << std::endl; std::cout << std::endl; @@ -39,6 +38,9 @@ void PrintUsage() std::cout << std::endl; std::cout << "It is supposed that this application will be updated frequently, so this might be not the latest one." << std::endl; std::cout << std::endl; +#if __LIBSYCL_MAJOR_VERSION >= 7 + std::cerr << "Notice: A portion of this sample was not build. To build the whole sample, revert to older oneAPI release (<= 2023.2.0)" << std::endl; +#endif } void run(sycl::queue *q) diff --git a/sdk/samples/dlworkloads/model_mixedprogramming.cpp b/sdk/samples/dlworkloads/model_mixedprogramming.cpp index cb03833..56be972 100644 --- a/sdk/samples/dlworkloads/model_mixedprogramming.cpp +++ b/sdk/samples/dlworkloads/model_mixedprogramming.cpp @@ -15,10 +15,15 @@ TinyTensor run_model_mixedprogramming(TinyTensor inp, sycl::queue *q) TinyTensor outp = run_syclkernel_operation_scaledown(inp, q); GlobalDeviceMemoryManager().free(inp.data); + // TODO(matthew.schilling@intel.com): Fails when run with XPTI tracing. We + // need to figure out a way to uncomment this. It crashes PTI-SDK and + // Unitrace built with OneAPI/ICPX >= 2024.0.0 . // the next operation uses oneDNN for conv2d +#if __LIBSYCL_MAJOR_VERSION < 7 inp = outp; outp = run_onednn_operation_conv2d(inp, q); GlobalDeviceMemoryManager().free(inp.data); +#endif // next operation uses oneMKL inp = outp; diff --git a/sdk/samples/dpc_gemm/main.cc b/sdk/samples/dpc_gemm/main.cc index 2fb0ae0..65155cd 100644 --- a/sdk/samples/dpc_gemm/main.cc +++ b/sdk/samples/dpc_gemm/main.cc @@ -80,8 +80,10 @@ static float RunAndCheck(sycl::queue queue, const std::vector &a, cgh.parallel_for( sycl::range<2>(size, size), [=](sycl::id<2> id) { - GEMM(a_acc.get_pointer(), b_acc.get_pointer(), c_acc.get_pointer(), - size, id); + auto a_acc_ptr = a_acc.get_multi_ptr(); + auto b_acc_ptr = b_acc.get_multi_ptr(); + auto c_acc_ptr = c_acc.get_multi_ptr(); + GEMM(a_acc_ptr.get(), b_acc_ptr.get(), c_acc_ptr.get(), size, id); }); }); queue.wait_and_throw(); diff --git a/sdk/samples/dpc_gemm_threaded/main.cc b/sdk/samples/dpc_gemm_threaded/main.cc index f605057..b82aca3 100644 --- a/sdk/samples/dpc_gemm_threaded/main.cc +++ b/sdk/samples/dpc_gemm_threaded/main.cc @@ -68,8 +68,10 @@ static float RunAndCheck(sycl::queue queue, const std::vector& a, cgh.parallel_for( sycl::range<2>(size, size), [=](sycl::id<2> id) { - GEMM(a_acc.get_pointer(),b_acc.get_pointer(),c_acc.get_pointer(), - size, id); + auto a_acc_ptr = a_acc.get_multi_ptr(); + auto b_acc_ptr = b_acc.get_multi_ptr(); + auto c_acc_ptr = c_acc.get_multi_ptr(); + GEMM(a_acc_ptr.get(), b_acc_ptr.get(), c_acc_ptr.get(), size, id); }); }); queue.wait_and_throw(); diff --git a/sdk/samples/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp b/sdk/samples/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp index 2f412ee..f7566ab 100644 --- a/sdk/samples/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp +++ b/sdk/samples/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp @@ -196,7 +196,7 @@ void Iso3dfdIterationSLM(sycl::nd_item<3> &it, float *next, float *prev, * */ void Iso3dfdIterationGlobal(sycl::nd_item<3> &it, float *next, float *prev, - float *vel, const float *coeff, int nx, int nxy, + const float *vel, const float *coeff, int nx, int nxy, int bx, int by, int z_offset, int full_end_z) { // We compute the start and the end position in the grid // for each work-item. @@ -381,17 +381,17 @@ bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, if (i % 2 == 0) h.parallel_for( nd_range(global_nd_range, local_nd_range), [=](auto it) { - Iso3dfdIterationSLM(it, next.get_pointer(), prev.get_pointer(), - vel.get_pointer(), coeff.get_pointer(), - tab.get_pointer(), nx, nxy, bx, by, + Iso3dfdIterationSLM(it, next.get(), prev.get(), + vel.get(), coeff.get(), + tab.get(), nx, nxy, bx, by, n3_block, end_z); }); else h.parallel_for( nd_range(global_nd_range, local_nd_range), [=](auto it) { - Iso3dfdIterationSLM(it, prev.get_pointer(), next.get_pointer(), - vel.get_pointer(), coeff.get_pointer(), - tab.get_pointer(), nx, nxy, bx, by, + Iso3dfdIterationSLM(it, prev.get(), next.get(), + vel.get(), coeff.get(), + tab.get(), nx, nxy, bx, by, n3_block, end_z); }); @@ -408,17 +408,25 @@ bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, if (i % 2 == 0) h.parallel_for( nd_range(global_nd_range, local_nd_range), [=](auto it) { - Iso3dfdIterationGlobal(it, next.get_pointer(), - prev.get_pointer(), vel.get_pointer(), - coeff.get_pointer(), nx, nxy, bx, by, + auto next_ptr = next.template get_multi_ptr(); + auto prev_ptr = prev.template get_multi_ptr(); + auto vel_ptr = vel.template get_multi_ptr(); + auto coeff_ptr = coeff.template get_multi_ptr(); + Iso3dfdIterationGlobal(it, next_ptr.get(), + prev_ptr.get(), vel_ptr.get(), + coeff_ptr.get(), nx, nxy, bx, by, n3_block, end_z); }); else h.parallel_for( nd_range(global_nd_range, local_nd_range), [=](auto it) { - Iso3dfdIterationGlobal(it, prev.get_pointer(), - next.get_pointer(), vel.get_pointer(), - coeff.get_pointer(), nx, nxy, bx, by, + auto next_ptr = next.template get_multi_ptr(); + auto prev_ptr = prev.template get_multi_ptr(); + auto vel_ptr = vel.template get_multi_ptr(); + auto coeff_ptr = coeff.template get_multi_ptr(); + Iso3dfdIterationGlobal(it, prev_ptr.get(), + next_ptr.get(), vel_ptr.get(), + coeff_ptr.get(), nx, nxy, bx, by, n3_block, end_z); }); #endif diff --git a/sdk/samples/vector_sq_add/vector_sq_add.cpp b/sdk/samples/vector_sq_add/vector_sq_add.cpp index be90716..2403691 100644 --- a/sdk/samples/vector_sq_add/vector_sq_add.cpp +++ b/sdk/samples/vector_sq_add/vector_sq_add.cpp @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { auto print_queue_info = [](const sycl::queue &sycl_queue) { auto queue_type = get_native(sycl_queue); -#if __LIBSYCL_MAJOR_VERSION >= 6 && __LIBSYCL_MINOR_VERSION >= 2 +#if __LIBSYCL_MAJOR_VERSION > 6 || (__LIBSYCL_MAJOR_VERSION == 6 && __LIBSYCL_MINOR_VERSION >= 2) // 1 (default) if (auto *ptr_queue_handle = std::get_if(&queue_type)) { diff --git a/sdk/test/main_dpcgemm_fixture.cc b/sdk/test/main_dpcgemm_fixture.cc index 0891f71..e29aa96 100644 --- a/sdk/test/main_dpcgemm_fixture.cc +++ b/sdk/test/main_dpcgemm_fixture.cc @@ -101,7 +101,10 @@ float RunAndCheck(sycl::queue queue, const std::vector& a, const std::vec auto c_acc = c_buf.get_access(cgh); cgh.parallel_for(sycl::range<2>(size, size), [=](sycl::id<2> id) { - GEMM(a_acc.get_pointer(), b_acc.get_pointer(), c_acc.get_pointer(), size, id); + auto a_acc_ptr = a_acc.get_multi_ptr(); + auto b_acc_ptr = b_acc.get_multi_ptr(); + auto c_acc_ptr = c_acc.get_multi_ptr(); + GEMM(a_acc_ptr.get(), b_acc_ptr.get(), c_acc_ptr.get(), size, id); }); }); queue.wait_and_throw(); diff --git a/sdk/test/main_vecsqadd_fixture.cc b/sdk/test/main_vecsqadd_fixture.cc index c55a9f2..91e4fb7 100644 --- a/sdk/test/main_vecsqadd_fixture.cc +++ b/sdk/test/main_vecsqadd_fixture.cc @@ -214,8 +214,6 @@ void RunExternalCorrIdTest(queue &q, const DoubleVector &a, const DoubleVector & vecAdd(q, a, b, sq_add); print_results(sq_add, vector_size); - // print_queue_info(q); - StartTracing(); vecAdd(q, c, d, sq_add2); StopTracing(); @@ -259,34 +257,6 @@ void RunVecsqadd(TestType a_test_type) { auto d_selector{gpu_selector_v}; queue q(d_selector, NULL); - // Underlying queue handle object changes based on value of - // SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=? - auto print_queue_info = [](const sycl::queue &sycl_queue) { - auto queue_type = get_native(sycl_queue); -#if __LIBSYCL_MAJOR_VERSION >= 6 && __LIBSYCL_MINOR_VERSION >= 2 - // 1 (default) - if (auto *ptr_queue_handle = std::get_if(&queue_type)) { - printf("Queue ptr: 0x%p, native queue: 0x%p, native device: 0x%p \n", &sycl_queue, - ptr_queue_handle, - get_native(sycl_queue.get_device())); - - // 0 - } else if (auto *ptr_queue_handle = std::get_if(&queue_type)) { - printf("Queue ptr: 0x%p, native queue: 0x%p, native device: 0x%p \n", &sycl_queue, - ptr_queue_handle, - get_native(sycl_queue.get_device())); - } else { - std::cerr << "Underlying level zero queue handle could not be obtained." << '\n'; - } -#else - printf("Queue ptr: 0x%p, native queue: 0x%p, native device: 0x%p \n", &sycl_queue, - get_native(sycl_queue), - get_native(sycl_queue.get_device())); -#endif - }; - - print_queue_info(q); - // Start Tests by Type if (a_test_type == TestType::RUN_ALL) {