From 77e9e390fadfa4271195326115fcb33564932f09 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Fri, 28 Jun 2024 05:58:07 +0000 Subject: [PATCH 01/10] ocl --- CMakeLists.txt | 7 + README.md | 3 + lib/ExecutionEngine/CMakeLists.txt | 4 + .../OPENCLRUNTIME/CMakeLists.txt | 42 ++ .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 449 ++++++++++++++++++ test/CMakeLists.txt | 8 +- test/lit.cfg.py | 2 + test/lit.site.cfg.py.in | 5 +- tools/CMakeLists.txt | 2 +- tools/imex-runner/imex-runner.py.in | 5 +- 10 files changed, 523 insertions(+), 4 deletions(-) create mode 100644 lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt create mode 100644 lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f06220661..3a23dc9c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,6 +172,7 @@ include(HandleLLVMOptions) set(IMEX_INCLUDE_TESTS 1 CACHE BOOL "Include targets for IMEX tests") set(IMEX_ENABLE_SYCL_RUNTIME 0 CACHE BOOL "Enable the Sycl Runtime") +set(IMEX_ENABLE_OPENCL_RUNTIME 0 CACHE BOOL "Enable the OpenCL Runtime") set(IMEX_ENABLE_L0_RUNTIME 0 CACHE BOOL "Enable the Level Zero Runtime") set(IMEX_ENABLE_BENCHMARK 0 CACHE BOOL "Enable the IMEX Benchmark (Depending on SYCL Runtime)") # Useful when building IMEX as an LLVM external project. @@ -192,6 +193,12 @@ else () set(IMEX_ENABLE_SYCL_RUNTIME 0) endif() +if (IMEX_ENABLE_OPENCL_RUNTIME) + set(IMEX_ENABLE_OPENCL_RUNTIME 1) +else () + set(IMEX_ENABLE_OPENCL_RUNTIME 0) +endif() + if (IMEX_ENABLE_L0_RUNTIME) set(IMEX_ENABLE_L0_RUNTIME 1) else () diff --git a/README.md b/README.md index 4d4855968..bbf96eabf 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,7 @@ cmake -G Ninja -B build -S llvm \ # For GPU support pass thes cmake variables to enable the required runtime libraries # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 +# -DIMEX_ENABLE_OPENCL_RUNTIME=1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install @@ -107,6 +108,7 @@ cmake -G Ninja -B build -S . \ # For GPU support pass thes cmake variables to enable the required runtime libraries # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 +# -DIMEX_ENABLE_OPENCL_RUNTIME=1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install @@ -127,6 +129,7 @@ cmake -G Ninja -B build -S . 
\ # For GPU support pass thes cmake variables to enable the required runtime libraries # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 +# -DIMEX_ENABLE_OPENCL_RUNTIME=1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt index c6b51ea40..06d90c4c6 100644 --- a/lib/ExecutionEngine/CMakeLists.txt +++ b/lib/ExecutionEngine/CMakeLists.txt @@ -6,6 +6,10 @@ if(IMEX_ENABLE_SYCL_RUNTIME) add_subdirectory(SYCLRUNTIME) endif() +if(IMEX_ENABLE_OPENCL_RUNTIME) + add_subdirectory(OPENCLRUNTIME) +endif() + add_mlir_library(imex_runner_utils SHARED ImexRunnerUtils.cpp diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt new file mode 100644 index 000000000..1974d75d9 --- /dev/null +++ b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -Wno-return-type -Wno-cast-qual") + +find_package(OpenCL REQUIRED) + +if(NOT OpenCL_FOUND) + message(FATAL_ERROR "OpenCL not found.") +endif() + +add_mlir_library(opencl-runtime + SHARED + OpenCLRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + +check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG) +if(NOT CXX_HAS_FRTTI_FLAG) + message(FATAL_ERROR "CXX compiler does not accept flag -frtti") +endif() +target_compile_options (opencl-runtime PUBLIC -fexceptions -frtti) + +target_include_directories(opencl-runtime PRIVATE + ${MLIR_INCLUDE_DIRS} + ) + +target_link_libraries(opencl-runtime PRIVATE ${OpenCL_LIBRARIES}) + +set_property(TARGET opencl-runtime APPEND PROPERTY BUILD_RPATH "${OpenCL_LIBRARY}") diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp new file mode 100644 index 000000000..9f61ad567 --- /dev/null +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -0,0 +1,449 @@ +// Copyright 2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
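The wrapper file that starts here exposes the same extern "C" entry points as the existing SYCL and Level Zero runtimes: gpuCreateStream, gpuStreamDestroy, gpuMemAlloc, gpuMemFree, gpuModuleLoad, gpuKernelGet, gpuLaunchKernel and gpuWait. For orientation, below is a minimal host-side sketch of the intended call sequence. It is illustrative only: spv/spvSize stand for a hypothetical SPIR-V blob containing a kernel named "add_one", and in real runs the lowered MLIR emits these calls rather than hand-written code.

#include <CL/cl.h>
#include <cstddef>

struct GPUSYCLQUEUE;                            // opaque queue handle (renamed GPUCLQUEUE later in this series)
struct ParamDesc { void *data; size_t size; };  // mirrors the wrapper's kernel-argument ABI

extern "C" GPUSYCLQUEUE *gpuCreateStream(void *device, void *context);
extern "C" void gpuStreamDestroy(GPUSYCLQUEUE *queue);
extern "C" void *gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared);
extern "C" void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr);
extern "C" cl_program gpuModuleLoad(GPUSYCLQUEUE *queue, const unsigned char *data, size_t dataSize);
extern "C" cl_kernel gpuKernelGet(GPUSYCLQUEUE *queue, cl_program module, const char *name);
extern "C" void gpuLaunchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY,
                                size_t gridZ, size_t blockX, size_t blockY, size_t blockZ,
                                size_t sharedMemBytes, void *params);
extern "C" void gpuWait(GPUSYCLQUEUE *queue);

void runOnce(const unsigned char *spv, size_t spvSize) {
  GPUSYCLQUEUE *q = gpuCreateStream(/*device=*/nullptr, /*context=*/nullptr); // default GPU device
  float *buf = static_cast<float *>(
      gpuMemAlloc(q, 64 * sizeof(float), /*alignment=*/0, /*isShared=*/true)); // USM allocation
  cl_program mod = gpuModuleLoad(q, spv, spvSize);
  cl_kernel krn = gpuKernelGet(q, mod, "add_one");
  ParamDesc args[] = {{&buf, sizeof(buf)}, {nullptr, 0}}; // null-terminated argument list
  // grid/block sizes are forwarded verbatim as the global/local ND-range sizes
  gpuLaunchKernel(q, krn, 64, 1, 1, 64, 1, 1, /*sharedMemBytes=*/0, args);
  gpuWait(q);
  gpuMemFree(q, buf);
  gpuStreamDestroy(q);
}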
+ +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template auto catchAll(F &&func) { + try { + return func(); + } catch (const std::exception &e) { + fprintf(stdout, "An exception was thrown: %s\n", e.what()); + fflush(stdout); + abort(); + } catch (...) { + fprintf(stdout, "An unknown exception was thrown\n"); + fflush(stdout); + abort(); + } +} + +#define CHECK2(a) \ + do { \ + (a); \ + if (err != CL_SUCCESS) { \ + fprintf(stderr, "FAIL: err=%d @ line=%d (%s)\n", err, __LINE__, (#a)); \ + abort(); \ + } \ + } while (0) + +#define L0_SAFE_CALL(call) \ + { \ + auto status = (call); \ + if (status != CL_SUCCESS) { \ + fprintf(stdout, "CL error %d\n", status); \ + fflush(stdout); \ + abort(); \ + } \ + } + +inline void checkResult(cl_int res, const char *func) { + if (res != CL_SUCCESS) + throw std::runtime_error(std::string(func) + + " failed: " + std::to_string(res)); +} + +#define CHECK_ZE_RESULT(expr) checkResult((expr), #expr) + +struct CLModule { + cl_program module = nullptr; + ~CLModule(); +}; + +constexpr char DeviceMemAllocName[] = "clDeviceMemAllocINTEL"; +constexpr char SharedMemAllocName[] = "clSharedMemAllocINTEL"; +constexpr char MemBlockingFreeName[] = "clMemBlockingFreeINTEL"; +constexpr char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; +constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL"; + +void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) { + void *ret = clGetExtensionFunctionAddressForPlatform(CurPlatform, FuncName); + + if (!ret) { + fprintf(stdout, "Failed to get CL extension function %s\n", FuncName); + fflush(stdout); + abort(); + } + return ret; +} + +void *queryCLExtFunc(cl_device_id dev, const char *FuncName) { + cl_platform_id CurPlatform; + L0_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), + &CurPlatform, nullptr)); + return queryCLExtFunc(CurPlatform, FuncName); +} + +struct CLExtTable { + clDeviceMemAllocINTEL_fn allocDev; + clSharedMemAllocINTEL_fn allocShared; + clMemBlockingFreeINTEL_fn blockingFree; + clSetKernelArgMemPointerINTEL_fn setKernelArgMemPtr; + clEnqueueMemcpyINTEL_fn enqueneMemcpy; + CLExtTable() = default; + CLExtTable(cl_device_id dev) { + cl_platform_id plat; + L0_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &plat, nullptr)); + allocDev = + (clDeviceMemAllocINTEL_fn)queryCLExtFunc(plat, DeviceMemAllocName); + allocShared = + (clSharedMemAllocINTEL_fn)queryCLExtFunc(plat, SharedMemAllocName); + blockingFree = + (clMemBlockingFreeINTEL_fn)queryCLExtFunc(plat, MemBlockingFreeName); + setKernelArgMemPtr = (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc( + plat, SetKernelArgMemPointerName); + enqueneMemcpy = + (clEnqueueMemcpyINTEL_fn)queryCLExtFunc(plat, EnqueueMemcpyName); + } +}; + +// an "almost" lock-free cache for cl_device_id mapping to CL extention function +// table reading from the table is lock-free. 
And writing to it (when +// cache-miss) requires locking +struct CLExtTableCache { + static constexpr int numExtCache = 16; + std::array, numExtCache> devices; + std::array tables; + std::mutex lock; + static CLExtTableCache &get() { + static CLExtTableCache v; + return v; + } + CLExtTable *query(cl_device_id dev) { + bool found = false; + int firstSearch = search(dev, 0, found); + if (found) { + return &tables[firstSearch]; + } + if (firstSearch == numExtCache) { + return nullptr; + } + { + std::lock_guard guard{lock}; + int secondSearch = search(dev, firstSearch, found); + if (found) { + return &tables[secondSearch]; + } + if (secondSearch == numExtCache) { + return nullptr; + } + tables[secondSearch] = CLExtTable(dev); + devices[secondSearch].store(dev, std::memory_order_acquire); + return &tables[secondSearch]; + } + } + +private: + int search(cl_device_id dev, int startIdx, bool &found) { + for (int i = startIdx; i < numExtCache; i++) { + auto val = devices[i].load(std::memory_order_release); + if (!val) { + found = false; + return i; + } + if (val == dev) { + found = true; + return i; + } + } + found = false; + return numExtCache; + } +}; + +} // namespace + +namespace { +// Create a Map for the spirv module lookup +std::map moduleCache; +std::mutex mutexLock; +} // namespace + +CLModule::~CLModule() { L0_SAFE_CALL(clReleaseProgram(module)); } + +struct ParamDesc { + void *data; + size_t size; + + bool operator==(const ParamDesc &rhs) const { + return data == rhs.data && size == rhs.size; + } + + bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); } +}; + +template size_t countUntil(T *ptr, T &&elem) { + assert(ptr); + auto curr = ptr; + while (*curr != elem) { + ++curr; + } + return static_cast(curr - ptr); +} + +static cl_device_id getDevice(cl_device_type *devtype) { + cl_platform_id platform; // OpenCL platform + cl_device_id device; // device ID + L0_SAFE_CALL(clGetPlatformIDs(1, &platform, NULL)); + L0_SAFE_CALL(clGetDeviceIDs(platform, *devtype, 1, &device, NULL)); + return device; +} + +struct GPUSYCLQUEUE { + + cl_device_id device_ = nullptr; + cl_context context_ = nullptr; + cl_command_queue queue_ = nullptr; + bool context_owned_ = false; + bool queue_owned_ = false; + CLExtTable *ext_table_ = nullptr; + + GPUSYCLQUEUE(cl_device_type *device, cl_context context, + cl_command_queue queue) { + cl_device_type defaultdev = CL_DEVICE_TYPE_GPU; + if (!device) { + device = &defaultdev; + } + device_ = getDevice(device); + init_context(context, queue, device_); + ext_table_ = CLExtTableCache::get().query(device_); + } + GPUSYCLQUEUE(cl_device_id device, cl_context context, + cl_command_queue queue) { + if (!device) { + cl_device_type defaultdev = CL_DEVICE_TYPE_GPU; + device = getDevice(&defaultdev); + } + device_ = device; + init_context(context, queue, device_); + ext_table_ = CLExtTableCache::get().query(device_); + } + ~GPUSYCLQUEUE() { + if (queue_ && queue_owned_) + clReleaseCommandQueue(queue_); + if (context_ && context_owned_) + clReleaseContext(context_); + } + +private: + void init_context(cl_context context, cl_command_queue queue, + cl_device_id device) { + if (queue) { + if (!context) { + throw std::runtime_error( + "Cannot create QUEUE wrapper with queue and without context"); + } + queue_ = queue; + queue_owned_ = true; + context_ = context; + context_owned_ = true; + return; + } + cl_int err; + if (!context) { + CHECK2(context_ = clCreateContext(NULL, 1, &device, NULL, NULL, &err)); + context_owned_ = true; + } else { + context_ = context; + } + 
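// Only this branch creates new OpenCL objects: a fresh context when none was
// supplied, and a command queue created with default properties (0, i.e. an
// in-order queue). Note that the call below passes the `context` argument;
// the second patch in this series switches it to the member `context_` so the
// freshly created context is actually the one used. Externally supplied
// queue/context pairs are adopted by the early-return branch above, but they
// are still flagged *_owned_, so the destructor releases them as well.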
CHECK2(queue_ = + clCreateCommandQueueWithProperties(context, device, 0, &err)); + queue_owned_ = true; + } +}; // end of GPUSYCLQUEUE + +static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size, + size_t alignment, bool isShared) { + void *memPtr = nullptr; + cl_int err; + if (isShared) { + auto func = queue->ext_table_ ? queue->ext_table_->allocShared + : (clSharedMemAllocINTEL_fn)queryCLExtFunc( + queue->device_, SharedMemAllocName); + CHECK2(memPtr = func(queue->context_, queue->device_, nullptr, size, + alignment, &err)); + } else { + auto func = queue->ext_table_ ? queue->ext_table_->allocDev + : (clDeviceMemAllocINTEL_fn)queryCLExtFunc( + queue->device_, DeviceMemAllocName); + CHECK2(memPtr = func(queue->context_, queue->device_, nullptr, size, + alignment, &err)); + } + return memPtr; +} + +static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) { + auto func = queue->ext_table_ ? queue->ext_table_->blockingFree + : (clMemBlockingFreeINTEL_fn)queryCLExtFunc( + queue->device_, MemBlockingFreeName); + L0_SAFE_CALL(func(queue->context_, ptr)); +} + +static cl_program loadModule(GPUSYCLQUEUE *queue, const unsigned char *data, + size_t dataSize) { + assert(data); + cl_int errNum = 0; + const unsigned char *codes[1] = {data}; + size_t sizes[1] = {dataSize}; + cl_program program; + cl_int err; + CHECK2(program = clCreateProgramWithBinary( + queue->context_, 1, &queue->device_, sizes, codes, &err, &errNum)); + const char *build_flags = nullptr; + // enable large register file if needed + if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) { + build_flags = + "-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer " + "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer " + "'-printregusage -enableBCR' "; + } + L0_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL)); + return program; +} + +static cl_kernel getKernel(GPUSYCLQUEUE *queue, cl_program program, + const char *name) { + cl_kernel kernel; + cl_int err; + CHECK2(kernel = clCreateKernel(program, name, &err)); + cl_bool TrueVal = CL_TRUE; + L0_SAFE_CALL(clSetKernelExecInfo( + kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, sizeof(cl_bool), + &TrueVal)); + L0_SAFE_CALL(clSetKernelExecInfo( + kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), + &TrueVal)); + return kernel; +} + +static void launchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, + size_t gridY, size_t gridZ, size_t blockX, + size_t blockY, size_t blockZ, size_t sharedMemBytes, + ParamDesc *params) { + // auto func = queue->ext_table_ + // ? queue->ext_table_->setKernelArgMemPtr + // : (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc( + // queue->device_, SetKernelArgMemPointerName); + auto paramsCount = countUntil(params, ParamDesc{nullptr, 0}); + // The assumption is, if there is a param for the shared local memory, + // then that will always be the last argument. 
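// In concrete terms: `params` is an array of {data, size} pairs terminated by
// a {nullptr, 0} entry. When sharedMemBytes != 0, the last real entry is
// skipped by the loop below and the same argument index is instead bound via
// clSetKernelArg(kernel, idx, sharedMemBytes, nullptr), the standard OpenCL
// idiom for giving the kernel a __local buffer of that size.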
+ if (sharedMemBytes) { + paramsCount = paramsCount - 1; + } + for (size_t i = 0; i < paramsCount; i++) { + auto param = params[i]; + L0_SAFE_CALL(clSetKernelArg(kernel, i, param.size, param.data)); + } + if (sharedMemBytes) { + L0_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr)); + } + size_t globalSize[3] = {gridX, gridY, gridZ}; + size_t localSize[3] = {blockX, blockY, blockZ}; + L0_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL, + globalSize, localSize, 0, NULL, NULL)); +} + +// Wrappers + +extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device, + void *context) { + return catchAll([&]() { + return new GPUSYCLQUEUE(reinterpret_cast(device), + reinterpret_cast(context), nullptr); + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) { + catchAll([&]() { delete queue; }); +} + +extern "C" SYCL_RUNTIME_EXPORT void * +gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { + return catchAll([&]() { + if (queue) { + return allocDeviceMemory(queue, size, alignment, isShared); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) { + catchAll([&]() { + if (queue && ptr) { + deallocDeviceMemory(queue, ptr); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT cl_program +gpuModuleLoad(GPUSYCLQUEUE *queue, const unsigned char *data, size_t dataSize) { + return catchAll([&]() { + if (queue) { + return loadModule(queue, data, dataSize); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT cl_kernel +gpuKernelGet(GPUSYCLQUEUE *queue, cl_program module, const char *name) { + return catchAll([&]() { + if (queue) { + return getKernel(queue, module, name); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void +gpuLaunchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, + size_t gridY, size_t gridZ, size_t blockX, size_t blockY, + size_t blockZ, size_t sharedMemBytes, void *params) { + return catchAll([&]() { + if (queue) { + launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, + sharedMemBytes, static_cast(params)); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) { + catchAll([&]() { + if (queue) { + L0_SAFE_CALL(clFinish(queue->queue_)); + } + }); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6f2c3f48d..bf9572773 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,13 +52,19 @@ if(IMEX_ENABLE_SYCL_RUNTIME) ) endif() +if(IMEX_ENABLE_OPENCL_RUNTIME) + list(APPEND IMEX_TEST_DEPENDS + opencl-runtime + ) +endif() + if(IMEX_ENABLE_L0_RUNTIME) list(APPEND IMEX_TEST_DEPENDS level-zero-runtime ) endif() -if(IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME) +if(IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME OR IMEX_ENABLE_OPENCL_RUNTIME) list(APPEND IMEX_TEST_DEPENDS l0-fp64-checker ) diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 4d7e9890f..fa19b9d04 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -41,6 +41,8 @@ config.substitutions.append(('%python_executable', config.python_executable)) if config.imex_enable_sycl_runtime: config.substitutions.append(('%sycl_runtime', config.sycl_runtime)) +if config.imex_enable_opencl_runtime: + config.substitutions.append(('%opencl_runtime', config.opencl_runtime)) if config.imex_enable_l0_runtime: config.substitutions.append(('%levelzero_runtime', config.levelzero_runtime)) if config.imex_enable_igpu: diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index 829e46573..a71dc657a 100644 --- 
a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -37,6 +37,7 @@ config.imex_tools_dir = "@IMEX_TOOLS_DIR@" config.imex_lib_dir = "@IMEX_LIB_DIR@" config.imex_enable_l0_runtime = @IMEX_ENABLE_L0_RUNTIME@ config.imex_enable_sycl_runtime = @IMEX_ENABLE_SYCL_RUNTIME@ +config.imex_enable_opencl_runtime = @IMEX_ENABLE_OPENCL_RUNTIME@ config.imex_enable_bf16_tests = @IMEX_ENABLE_BF16_TESTS@ config.imex_enable_excluded_tests = @IMEX_ENABLE_EXCLUDED_TESTS@ config.imex_enable_ats_target = @IMEX_ENABLE_ATS_TARGET@ @@ -53,9 +54,11 @@ if config.enable_vulkan_runner: config.vulkan_runtime_wrappers = os.path.normpath(os.path.join(config.mlir_runner_utils_dir, config.shlib_prefix + "vulkan-runtime-wrappers" + config.llvm_shlib_ext)) if config.imex_enable_sycl_runtime: config.sycl_runtime = os.path.normpath(os.path.join(config.imex_lib_dir, config.shlib_prefix + "sycl-runtime" + config.llvm_shlib_ext)) +if config.imex_enable_opencl_runtime: + config.opencl_runtime = os.path.normpath(os.path.join(config.imex_lib_dir, config.shlib_prefix + "opencl-runtime" + config.llvm_shlib_ext)) if config.imex_enable_l0_runtime: config.levelzero_runtime = os.path.normpath(os.path.join(config.imex_lib_dir, config.shlib_prefix + "level-zero-runtime" + config.llvm_shlib_ext)) -config.imex_enable_igpu = config.imex_enable_l0_runtime or config.imex_enable_sycl_runtime +config.imex_enable_igpu = config.imex_enable_l0_runtime or config.imex_enable_sycl_runtime or config.imex_enable_opencl_runtime if config.imex_enable_igpu: config.l0_fp64_checker = os.path.normpath(os.path.join(config.imex_tools_dir, "l0-fp64-checker")) try: diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 41a1f23f0..3c189a92a 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(imex-runner) add_subdirectory(imex-opt) -if(IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME) +if(IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME or IMEX_ENABLE_OPENCL_RUNTIME) add_subdirectory(l0-fp64-checker) endif() add_subdirectory(imex-cpu-runner) diff --git a/tools/imex-runner/imex-runner.py.in b/tools/imex-runner/imex-runner.py.in index a94a9a8b7..f1bec1897 100644 --- a/tools/imex-runner/imex-runner.py.in +++ b/tools/imex-runner/imex-runner.py.in @@ -53,10 +53,11 @@ import subprocess imex_enable_vulkan_runner = @IMEX_ENABLE_VULKAN_RUNNER@ imex_enable_l0_runtime = @IMEX_ENABLE_L0_RUNTIME@ imex_enable_sycl_runtime = @IMEX_ENABLE_SYCL_RUNTIME@ +imex_enable_opencl_runtime = @IMEX_ENABLE_OPENCL_RUNTIME@ runner_choices = ['imex-cpu-runner'] enabled_features = [] -all_features = ['vulkan-runner', 'l0-runtime', 'sycl-runtime', 'igpu-fp64'] +all_features = ['vulkan-runner', 'l0-runtime', 'sycl-runtime', 'opencl-runtime, 'igpu-fp64'] if imex_enable_vulkan_runner: runner_choices.append('mlir-vulkan-runner') enabled_features.append('vulkan-runner') @@ -64,6 +65,8 @@ if imex_enable_l0_runtime: enabled_features.append('l0-runtime') if imex_enable_sycl_runtime: enabled_features.append('sycl-runtime') +if imex_enable_opencl_runtime: + enabled_features.append('opencl-runtime') class SplitArgs(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): From 45632e636c50676c7122d106eb7c5b2eeef803b1 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Fri, 28 Jun 2024 09:05:27 +0000 Subject: [PATCH 02/10] fix --- .../OPENCLRUNTIME/CMakeLists.txt | 4 +- .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 55 ++++++++++++------- test/Gen/PlaidML/CppEdsl.Convolution.mlir.in | 4 ++ 
.../Gen/PlaidML/CppEdsl.DotF16_AccF32.mlir.in | 4 ++ ...mobilenetv3-linalg-without-tensor-pad.mlir | 4 ++ .../Mobilenet-v3/mobilenetv3-linalg.mlir | 4 ++ tools/CMakeLists.txt | 2 +- tools/imex-runner/imex-runner.py.in | 2 +- 8 files changed, 55 insertions(+), 24 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt index 1974d75d9..86daca7e9 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt +++ b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt @@ -35,8 +35,10 @@ target_compile_options (opencl-runtime PUBLIC -fexceptions -frtti) target_include_directories(opencl-runtime PRIVATE ${MLIR_INCLUDE_DIRS} + ${OpenCL_INCLUDE_DIRS} ) +message(STATUS "OpenCL Libraries: ${OpenCL_LIBRARIES}") target_link_libraries(opencl-runtime PRIVATE ${OpenCL_LIBRARIES}) -set_property(TARGET opencl-runtime APPEND PROPERTY BUILD_RPATH "${OpenCL_LIBRARY}") +# set_property(TARGET opencl-runtime APPEND PROPERTY BUILD_RPATH "${OpenCL_LIBRARY}") diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index 9f61ad567..c7bcae3da 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -43,12 +43,12 @@ template auto catchAll(F &&func) { try { return func(); } catch (const std::exception &e) { - fprintf(stdout, "An exception was thrown: %s\n", e.what()); - fflush(stdout); + fprintf(stderr, "An exception was thrown: %s\n", e.what()); + fflush(stderr); abort(); } catch (...) { - fprintf(stdout, "An unknown exception was thrown\n"); - fflush(stdout); + fprintf(stderr, "An unknown exception was thrown\n"); + fflush(stderr); abort(); } } @@ -66,8 +66,9 @@ template auto catchAll(F &&func) { { \ auto status = (call); \ if (status != CL_SUCCESS) { \ - fprintf(stdout, "CL error %d\n", status); \ - fflush(stdout); \ + fprintf(stderr, "CL error %d @ line=%d (%s)\n", status, __LINE__, \ + (#call)); \ + fflush(stderr); \ abort(); \ } \ } @@ -95,8 +96,7 @@ void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) { void *ret = clGetExtensionFunctionAddressForPlatform(CurPlatform, FuncName); if (!ret) { - fprintf(stdout, "Failed to get CL extension function %s\n", FuncName); - fflush(stdout); + fflush(stderr); abort(); } return ret; @@ -141,6 +141,7 @@ struct CLExtTableCache { std::array, numExtCache> devices; std::array tables; std::mutex lock; + CLExtTableCache() { std::fill(devices.begin(), devices.end(), nullptr); } static CLExtTableCache &get() { static CLExtTableCache v; return v; @@ -164,7 +165,7 @@ struct CLExtTableCache { return nullptr; } tables[secondSearch] = CLExtTable(dev); - devices[secondSearch].store(dev, std::memory_order_acquire); + devices[secondSearch].store(dev, std::memory_order_release); return &tables[secondSearch]; } } @@ -172,7 +173,7 @@ struct CLExtTableCache { private: int search(cl_device_id dev, int startIdx, bool &found) { for (int i = startIdx; i < numExtCache; i++) { - auto val = devices[i].load(std::memory_order_release); + auto val = devices[i].load(std::memory_order_acquire); if (!val) { found = false; return i; @@ -283,7 +284,7 @@ struct GPUSYCLQUEUE { context_ = context; } CHECK2(queue_ = - clCreateCommandQueueWithProperties(context, device, 0, &err)); + clCreateCommandQueueWithProperties(context_, device, 0, &err)); queue_owned_ = true; } }; // end of GPUSYCLQUEUE @@ -325,13 +326,13 @@ static cl_program loadModule(GPUSYCLQUEUE *queue, const 
unsigned char *data, cl_int err; CHECK2(program = clCreateProgramWithBinary( queue->context_, 1, &queue->device_, sizes, codes, &err, &errNum)); - const char *build_flags = nullptr; + const char *build_flags = "-cl-kernel-arg-info -x spir"; // enable large register file if needed if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) { build_flags = "-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer " "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer " - "'-printregusage -enableBCR' "; + "'-printregusage -enableBCR' -cl-kernel-arg-info -x spir"; } L0_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL)); return program; @@ -343,6 +344,9 @@ static cl_kernel getKernel(GPUSYCLQUEUE *queue, cl_program program, cl_int err; CHECK2(kernel = clCreateKernel(program, name, &err)); cl_bool TrueVal = CL_TRUE; + L0_SAFE_CALL(clSetKernelExecInfo( + kernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, sizeof(cl_bool), + &TrueVal)); L0_SAFE_CALL(clSetKernelExecInfo( kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, sizeof(cl_bool), &TrueVal)); @@ -356,10 +360,10 @@ static void launchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, ParamDesc *params) { - // auto func = queue->ext_table_ - // ? queue->ext_table_->setKernelArgMemPtr - // : (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc( - // queue->device_, SetKernelArgMemPointerName); + auto func = queue->ext_table_ + ? queue->ext_table_->setKernelArgMemPtr + : (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc( + queue->device_, SetKernelArgMemPointerName); auto paramsCount = countUntil(params, ParamDesc{nullptr, 0}); // The assumption is, if there is a param for the shared local memory, // then that will always be the last argument. 
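The memory-order swap above is the usual publication pairing: the writer fills tables[i] first and only then release-stores the device id, while readers acquire-load the id before touching the table, so a reader that observes a non-null id is guaranteed to also observe a fully built CLExtTable (the original acquire-on-store / release-on-load combination gave no such guarantee). The new -cl-kernel-arg-info build flag, in turn, presumably exists so that the clGetKernelArgInfo query added in the next hunk can classify pointer arguments. A stripped-down sketch of the same publish/consume pattern, with illustrative names that are not part of this patch:

#include <atomic>

struct Payload { int value = 0; };

static Payload slot;                        // built before publication
static std::atomic<bool> published{false};  // publication flag

void publish(int v) {
  slot.value = v;                                    // 1. fill the payload
  published.store(true, std::memory_order_release);  // 2. then publish it
}

bool consume(int &out) {
  if (!published.load(std::memory_order_acquire))    // pairs with the release store
    return false;
  out = slot.value;                                   // guaranteed to see step 1
  return true;
}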
@@ -367,8 +371,16 @@ static void launchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, paramsCount = paramsCount - 1; } for (size_t i = 0; i < paramsCount; i++) { + cl_kernel_arg_address_qualifier name; + size_t nameSize = sizeof(name); + L0_SAFE_CALL(clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER, + sizeof(name), &name, &nameSize)); auto param = params[i]; - L0_SAFE_CALL(clSetKernelArg(kernel, i, param.size, param.data)); + if (param.size == sizeof(void *) && name == CL_KERNEL_ARG_ADDRESS_GLOBAL) { + L0_SAFE_CALL(func(kernel, i, *(void **)param.data)); + } else { + L0_SAFE_CALL(clSetKernelArg(kernel, i, param.size, param.data)); + } } if (sharedMemBytes) { L0_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr)); @@ -419,8 +431,9 @@ gpuModuleLoad(GPUSYCLQUEUE *queue, const unsigned char *data, size_t dataSize) { }); } -extern "C" SYCL_RUNTIME_EXPORT cl_kernel -gpuKernelGet(GPUSYCLQUEUE *queue, cl_program module, const char *name) { +extern "C" SYCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUSYCLQUEUE *queue, + cl_program module, + const char *name) { return catchAll([&]() { if (queue) { return getKernel(queue, module, name); @@ -429,7 +442,7 @@ gpuKernelGet(GPUSYCLQUEUE *queue, cl_program module, const char *name) { } extern "C" SYCL_RUNTIME_EXPORT void -gpuLaunchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, +gpuLaunchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, void *params) { return catchAll([&]() { diff --git a/test/Gen/PlaidML/CppEdsl.Convolution.mlir.in b/test/Gen/PlaidML/CppEdsl.Convolution.mlir.in index bdbbdaa2c..36959199c 100644 --- a/test/Gen/PlaidML/CppEdsl.Convolution.mlir.in +++ b/test/Gen/PlaidML/CppEdsl.Convolution.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> diff --git a/test/Gen/PlaidML/CppEdsl.DotF16_AccF32.mlir.in b/test/Gen/PlaidML/CppEdsl.DotF16_AccF32.mlir.in index a88c02842..5c6e7cc54 100644 --- a/test/Gen/PlaidML/CppEdsl.DotF16_AccF32.mlir.in +++ b/test/Gen/PlaidML/CppEdsl.DotF16_AccF32.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> diff --git a/test/Models/Mobilenet-v3/mobilenetv3-linalg-without-tensor-pad.mlir b/test/Models/Mobilenet-v3/mobilenetv3-linalg-without-tensor-pad.mlir index 
8303e33f9..579d9001a 100644 --- a/test/Models/Mobilenet-v3/mobilenetv3-linalg-without-tensor-pad.mlir +++ b/test/Models/Mobilenet-v3/mobilenetv3-linalg-without-tensor-pad.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> diff --git a/test/Models/Mobilenet-v3/mobilenetv3-linalg.mlir b/test/Models/Mobilenet-v3/mobilenetv3-linalg.mlir index ed5e01d36..71eabcb10 100644 --- a/test/Models/Mobilenet-v3/mobilenetv3-linalg.mlir +++ b/test/Models/Mobilenet-v3/mobilenetv3-linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 3c189a92a..b9cf66b55 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(imex-runner) add_subdirectory(imex-opt) -if(IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME or IMEX_ENABLE_OPENCL_RUNTIME) +if((IMEX_ENABLE_L0_RUNTIME OR IMEX_ENABLE_SYCL_RUNTIME) OR IMEX_ENABLE_OPENCL_RUNTIME) add_subdirectory(l0-fp64-checker) endif() add_subdirectory(imex-cpu-runner) diff --git a/tools/imex-runner/imex-runner.py.in b/tools/imex-runner/imex-runner.py.in index f1bec1897..48af07abc 100644 --- a/tools/imex-runner/imex-runner.py.in +++ b/tools/imex-runner/imex-runner.py.in @@ -57,7 +57,7 @@ imex_enable_opencl_runtime = @IMEX_ENABLE_OPENCL_RUNTIME@ runner_choices = ['imex-cpu-runner'] enabled_features = [] -all_features = ['vulkan-runner', 'l0-runtime', 'sycl-runtime', 'opencl-runtime, 'igpu-fp64'] +all_features = ['vulkan-runner', 'l0-runtime', 'sycl-runtime', 'opencl-runtime', 'igpu-fp64'] if imex_enable_vulkan_runner: runner_choices.append('mlir-vulkan-runner') enabled_features.append('vulkan-runner') From afcd173bf21e2233d6c58ade74025e2bd20600b0 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Mon, 1 Jul 2024 05:55:02 +0000 Subject: [PATCH 03/10] add tests [wip] --- .../OPENCLRUNTIME/CMakeLists.txt | 3 - .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 213 ++++++++---------- .../GPUToSPIRV/printf_with_runner.mlir | 4 + test/Gen/PlaidML/OpTest.Argmax.mlir.in | 4 + .../PlaidML/OpTest.BroadcastNonNumpy.mlir.in | 4 + test/Gen/PlaidML/OpTest.ComplexConv2D.mlir.in | 4 + test/Gen/PlaidML/OpTest.EltwiseAdd.mlir.in | 4 + test/Gen/PlaidML/OpTest.EltwiseAddInt.mlir.in | 4 + .../PlaidML/OpTest.ExplicitPadding.mlir.in | 4 + .../PlaidML/OpTest.LogicalAnd_mixed.mlir.in | 4 + test/Gen/PlaidML/OpTest.MaxPool1D.mlir.in | 4 + 
test/Gen/PlaidML/OpTest.Quantize.mlir.in | 4 + test/Gen/PlaidML/OpTest.Relu.mlir.in | 4 + test/Gen/PlaidML/OpTest.Softmax.mlir.in | 4 + test/Gen/PlaidML/OpTest.Sum.mlir.in | 4 + test/Gen/PlaidML/OpTest.Transpose.mlir.in | 4 + .../Dialect/XeGPU/dynamic_memref.vc.mlir | 4 + .../Integration/Dialect/XeGPU/exp_f32.vc.mlir | 4 + .../Dialect/XeGPU/flash_attention_fwd.mlir | 4 + .../gemm_1024x1016x1016_f16_f16_f32.mlir | 4 + .../Dialect/XeGPU/gemm_1024x1024xbf16.mlir | 4 + .../Dialect/XeGPU/gemm_1024x1024xf16.mlir | 4 + ...gemm_1024x1024xf16.using.updateoffset.mlir | 4 + .../XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir | 4 + ...kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir | 4 + ...4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir | 4 + ...ith_transposed_B_1kx1kx1k_f16_f16_f32.mlir | 4 + ...tores_8x32xf16_w_1d_vector_shuffle.vc.mlir | 4 + ...tores_8x32xf16_w_2d_vector_shuffle.vc.mlir | 4 + ...8x32xf16_w_constant_vector_shuffle.vc.mlir | 4 + .../Dialect/XeGPU/load2d-padding-f32.mlir | 4 + .../Dialect/XeGPU/load2d-padding.mlir | 4 + .../Dialect/XeGPU/load2d_dpas_store2d.mlir | 4 + .../load_with_block_array_16_16_2.vc.mlir | 4 + .../load_with_block_array_32_16_2.vc.mlir | 4 + .../load_with_block_array_8_16_2.vc.mlir | 4 + .../Integration/Dialect/XeGPU/preop_dpas.mlir | 4 + .../Dialect/XeGPU/transpose_8x16xf16.mlir | 4 + .../Dialect/XeGPU/transpose_8x16xf32.mlir | 4 + .../Dialect/XeGPU/transpose_8x8xf16.mlir | 4 + .../Dialect/XeGPU/transpose_8x8xf32.mlir | 4 + .../Dialect/XeGPU/vector_broadcast_1.mlir | 4 + .../Dialect/XeGPU/vector_broadcast_2.mlir | 4 + .../vector_extract_strided_slice_1.vc.mlir | 4 + .../vector_extract_strided_slice_2.vc.mlir | 4 + .../Dialect/XeGPU/vector_insert_1.mlir | 4 + .../Dialect/XeGPU/vector_insert_2.mlir | 4 + ...rom_logmhalo_jax_kern_0_before_linalg.mlir | 4 + ..._get_age_weights_from_tables.8_linalg.mlir | 4 + .../janet/jit__get_lgt_birth.7_linalg.mlir | 4 + ...__get_met_weights_singlegal.43_linalg.mlir | 4 + test/Jax/janet/jit__net_loss.91_linalg.mlir | 4 + .../jit__unit_scale_traindata.47_linalg.mlir | 4 + test/Jax/qoc/jit__diag.11_linalg.mlir | 4 + test/Jax/qoc/jit__reduce_sum.357_linalg.mlir | 4 + test/Jax/qoc/jit_absolute.341_linalg.mlir | 4 + test/Jax/qoc/jit_matmul.338_linalg.mlir | 4 + test/Jax/qoc/jit_swapaxes.304_linalg.mlir | 4 + test/Jax/qoc/jit_trace.340_linalg.mlir | 4 + test/PlaidML/CppEdsl.BigDot.mlir | 4 + test/PlaidML/CppEdsl.BitAnd.mlir | 4 + test/PlaidML/CppEdsl.BitAndScalar.mlir | 4 + test/PlaidML/CppEdsl.BitLeft.mlir | 4 + test/PlaidML/CppEdsl.BitNot.mlir | 4 + test/PlaidML/CppEdsl.BitOr.mlir | 4 + test/PlaidML/CppEdsl.BitRightScalar.mlir | 4 + test/PlaidML/CppEdsl.BitRightTensor.mlir | 4 + test/PlaidML/CppEdsl.BitXor.mlir | 4 + test/PlaidML/CppEdsl.BroadcastCmp.mlir | 4 + test/PlaidML/CppEdsl.Cast.mlir | 4 + test/PlaidML/CppEdsl.ConstAdd.mlir | 4 + test/PlaidML/CppEdsl.ConvI8.mlir | 4 + test/PlaidML/CppEdsl.Convolution.mlir | 4 + test/PlaidML/CppEdsl.Cos.mlir | 4 + test/PlaidML/CppEdsl.CumSum.mlir | 4 + test/PlaidML/CppEdsl.DefractLong.mlir | 4 + test/PlaidML/CppEdsl.Dot.mlir | 4 + test/PlaidML/CppEdsl.DotF16.mlir | 4 + test/PlaidML/CppEdsl.DotF16_AccF32.mlir | 4 + test/PlaidML/CppEdsl.DoubleDot.mlir | 4 + test/PlaidML/CppEdsl.DupOut.mlir | 4 + test/PlaidML/CppEdsl.EltwiseMod.mlir | 4 + test/PlaidML/CppEdsl.Erf.mlir | 4 + test/PlaidML/OpTest.Abs.mlir | 4 + test/PlaidML/OpTest.Argmax.mlir | 4 + test/PlaidML/OpTest.BinaryCrossentropy.mlir | 4 + test/PlaidML/OpTest.BroadcastNonNumpy.mlir | 4 + test/PlaidML/OpTest.ComplexConv2D.mlir | 4 + test/PlaidML/OpTest.Conv1D.mlir 
| 4 + test/PlaidML/OpTest.Conv2DDilated.mlir | 4 + test/PlaidML/OpTest.Dot.mlir | 4 + test/PlaidML/OpTest.DotF16.mlir | 4 + test/PlaidML/OpTest.EltwiseAdd.mlir | 4 + test/PlaidML/OpTest.ExplicitPadding.mlir | 4 + test/PlaidML/OpTest.ExplicitPaddingInf.mlir | 4 + .../PlaidML/OpTest.ExplicitPaddingNegInf.mlir | 4 + test/PlaidML/OpTest.Floor.mlir | 4 + test/PlaidML/OpTest.GEMM_INT32.mlir | 4 + test/PlaidML/OpTest.GEMM_INT64.mlir | 4 + test/PlaidML/OpTest.GEMM_INT8.mlir | 4 + test/PlaidML/OpTest.GEMM_UINT64.mlir | 4 + test/PlaidML/OpTest.GEMVC_INT64.mlir | 4 + test/PlaidML/OpTest.GEMV_FLOAT32.mlir | 4 + test/PlaidML/OpTest.GEMV_INT32.mlir | 4 + test/PlaidML/OpTest.GEMV_INT64.mlir | 4 + test/PlaidML/OpTest.GEMV_INT8.mlir | 4 + test/PlaidML/OpTest.GEMV_UINT64.mlir | 4 + test/PlaidML/OpTest.GlobalMin.mlir | 4 + test/PlaidML/OpTest.LarsMomentum4d.mlir | 4 + test/PlaidML/OpTest.Layer.mlir | 4 + test/PlaidML/OpTest.LayerEmbeddedConst.mlir | 4 + test/PlaidML/OpTest.LayerException.mlir | 4 + test/PlaidML/OpTest.LayerMulti.mlir | 4 + test/PlaidML/OpTest.LayerOperandOrder.mlir | 4 + test/PlaidML/OpTest.LayerUnusedOperand.mlir | 4 + test/PlaidML/OpTest.Lens.mlir | 4 + test/PlaidML/OpTest.LogicalAnd_mixed.mlir | 4 + test/PlaidML/OpTest.LogicalAnd_uint64.mlir | 4 + test/PlaidML/OpTest.LogicalNot_float.mlir | 4 + test/PlaidML/OpTest.LogicalNot_int32.mlir | 4 + test/PlaidML/OpTest.LogicalOr_float.mlir | 4 + test/PlaidML/OpTest.LogicalOr_int32.mlir | 4 + test/PlaidML/OpTest.LogicalOr_uint64.mlir | 4 + test/PlaidML/OpTest.Matmul.mlir | 4 + test/PlaidML/OpTest.Max.mlir | 4 + test/PlaidML/OpTest.MaxPool1D.mlir | 4 + test/PlaidML/OpTest.MnistCnn.mlir | 4 + test/PlaidML/OpTest.MnistMlp.mlir | 4 + test/PlaidML/OpTest.Pow.mlir | 4 + test/PlaidML/OpTest.Quantize.mlir | 4 + test/PlaidML/OpTest.Reciprocal.mlir | 4 + test/PlaidML/OpTest.Relu.mlir | 4 + test/PlaidML/OpTest.RepeatElements.mlir | 4 + test/PlaidML/OpTest.ReshapeFold.mlir | 4 + test/PlaidML/OpTest.ReshapeIntoScalar.mlir | 4 + test/PlaidML/OpTest.Select.mlir | 4 + test/PlaidML/OpTest.Shape.mlir | 4 + test/PlaidML/OpTest.SimpleAdd.mlir | 4 + test/PlaidML/OpTest.Sin.mlir | 4 + test/PlaidML/OpTest.SinH.mlir | 4 + test/PlaidML/OpTest.Softmax.mlir | 4 + test/PlaidML/OpTest.Sum.dynamic.mlir | 4 + test/PlaidML/OpTest.Sum.mlir | 4 + test/PlaidML/OpTest.Tan.mlir | 4 + test/PlaidML/OpTest.Transpose.mlir | 4 + test/SPIRV/CppEdsl.Convolution_BF16.mlir | 4 + .../DPAS_Dynamic_Size_BF16.mlir | 4 + .../DPAS_Static_Size_BF16.mlir | 4 + .../IntelVectorExtension/DPAS_raw_send.mlir | 4 + .../Load_1d_raw_send.mlir | 5 +- .../IntelVectorExtension/Load_1d_slm.mlir | 4 + .../Load_2d_raw_send.mlir | 4 + .../Store2d_raw_send.mlir | 4 + 153 files changed, 693 insertions(+), 128 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt index 86daca7e9..053c4f916 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt +++ b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
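A plausible reading of the CMake cleanup that follows: once patch 03 removes the catchAll/exception plumbing from the wrappers, every exported function returns explicitly on all paths (for example gpuMemAlloc now ends in "return nullptr;"), so the blanket -Wno-return-type and -Wno-cast-qual silencers added in patch 01 are no longer needed and the custom CMAKE_CXX_FLAGS line can simply be dropped.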
-set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -Wno-return-type -Wno-cast-qual") - find_package(OpenCL REQUIRED) if(NOT OpenCL_FOUND) @@ -41,4 +39,3 @@ target_include_directories(opencl-runtime PRIVATE message(STATUS "OpenCL Libraries: ${OpenCL_LIBRARIES}") target_link_libraries(opencl-runtime PRIVATE ${OpenCL_LIBRARIES}) -# set_property(TARGET opencl-runtime APPEND PROPERTY BUILD_RPATH "${OpenCL_LIBRARY}") diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index c7bcae3da..9d424dfd3 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -39,21 +39,7 @@ namespace { -template auto catchAll(F &&func) { - try { - return func(); - } catch (const std::exception &e) { - fprintf(stderr, "An exception was thrown: %s\n", e.what()); - fflush(stderr); - abort(); - } catch (...) { - fprintf(stderr, "An unknown exception was thrown\n"); - fflush(stderr); - abort(); - } -} - -#define CHECK2(a) \ +#define CL_SAFE_CALL2(a) \ do { \ (a); \ if (err != CL_SUCCESS) { \ @@ -62,35 +48,22 @@ template auto catchAll(F &&func) { } \ } while (0) -#define L0_SAFE_CALL(call) \ +#define CL_SAFE_CALL(call) \ { \ auto status = (call); \ if (status != CL_SUCCESS) { \ fprintf(stderr, "CL error %d @ line=%d (%s)\n", status, __LINE__, \ (#call)); \ - fflush(stderr); \ abort(); \ } \ } -inline void checkResult(cl_int res, const char *func) { - if (res != CL_SUCCESS) - throw std::runtime_error(std::string(func) + - " failed: " + std::to_string(res)); -} - -#define CHECK_ZE_RESULT(expr) checkResult((expr), #expr) - -struct CLModule { - cl_program module = nullptr; - ~CLModule(); -}; - constexpr char DeviceMemAllocName[] = "clDeviceMemAllocINTEL"; constexpr char SharedMemAllocName[] = "clSharedMemAllocINTEL"; constexpr char MemBlockingFreeName[] = "clMemBlockingFreeINTEL"; -constexpr char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; -constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL"; +constexpr char SetKernelArgMemPointerName[] = + "clSetKernelArgMemPointerINTEL"; +static constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL"; void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) { void *ret = clGetExtensionFunctionAddressForPlatform(CurPlatform, FuncName); @@ -104,7 +77,7 @@ void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) { void *queryCLExtFunc(cl_device_id dev, const char *FuncName) { cl_platform_id CurPlatform; - L0_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), + CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &CurPlatform, nullptr)); return queryCLExtFunc(CurPlatform, FuncName); } @@ -118,7 +91,7 @@ struct CLExtTable { CLExtTable() = default; CLExtTable(cl_device_id dev) { cl_platform_id plat; - L0_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, + CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &plat, nullptr)); allocDev = (clDeviceMemAllocINTEL_fn)queryCLExtFunc(plat, DeviceMemAllocName); @@ -190,14 +163,6 @@ struct CLExtTableCache { } // namespace -namespace { -// Create a Map for the spirv module lookup -std::map moduleCache; -std::mutex mutexLock; -} // namespace - -CLModule::~CLModule() { L0_SAFE_CALL(clReleaseProgram(module)); } - struct ParamDesc { void *data; size_t size; @@ -221,12 +186,12 @@ template size_t countUntil(T *ptr, T &&elem) { static cl_device_id getDevice(cl_device_type 
*devtype) { cl_platform_id platform; // OpenCL platform cl_device_id device; // device ID - L0_SAFE_CALL(clGetPlatformIDs(1, &platform, NULL)); - L0_SAFE_CALL(clGetDeviceIDs(platform, *devtype, 1, &device, NULL)); + CL_SAFE_CALL(clGetPlatformIDs(1, &platform, NULL)); + CL_SAFE_CALL(clGetDeviceIDs(platform, *devtype, 1, &device, NULL)); return device; } -struct GPUSYCLQUEUE { +struct GPUCLQUEUE { cl_device_id device_ = nullptr; cl_context context_ = nullptr; @@ -234,9 +199,11 @@ struct GPUSYCLQUEUE { bool context_owned_ = false; bool queue_owned_ = false; CLExtTable *ext_table_ = nullptr; + std::vector programs_; + std::vector kernels_; - GPUSYCLQUEUE(cl_device_type *device, cl_context context, - cl_command_queue queue) { + GPUCLQUEUE(cl_device_type *device, cl_context context, + cl_command_queue queue) { cl_device_type defaultdev = CL_DEVICE_TYPE_GPU; if (!device) { device = &defaultdev; @@ -245,8 +212,7 @@ struct GPUSYCLQUEUE { init_context(context, queue, device_); ext_table_ = CLExtTableCache::get().query(device_); } - GPUSYCLQUEUE(cl_device_id device, cl_context context, - cl_command_queue queue) { + GPUCLQUEUE(cl_device_id device, cl_context context, cl_command_queue queue) { if (!device) { cl_device_type defaultdev = CL_DEVICE_TYPE_GPU; device = getDevice(&defaultdev); @@ -255,7 +221,13 @@ struct GPUSYCLQUEUE { init_context(context, queue, device_); ext_table_ = CLExtTableCache::get().query(device_); } - ~GPUSYCLQUEUE() { + ~GPUCLQUEUE() { + for (auto p : kernels_) { + clReleaseKernel(p); + } + for (auto p : programs_) { + clReleaseProgram(p); + } if (queue_ && queue_owned_) clReleaseCommandQueue(queue_); if (context_ && context_owned_) @@ -278,45 +250,46 @@ struct GPUSYCLQUEUE { } cl_int err; if (!context) { - CHECK2(context_ = clCreateContext(NULL, 1, &device, NULL, NULL, &err)); + CL_SAFE_CALL2(context_ = + clCreateContext(NULL, 1, &device, NULL, NULL, &err)); context_owned_ = true; } else { context_ = context; } - CHECK2(queue_ = - clCreateCommandQueueWithProperties(context_, device, 0, &err)); + CL_SAFE_CALL2( + queue_ = clCreateCommandQueueWithProperties(context_, device, 0, &err)); queue_owned_ = true; } -}; // end of GPUSYCLQUEUE +}; // end of GPUCLQUEUE -static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size, - size_t alignment, bool isShared) { +static void *allocDeviceMemory(GPUCLQUEUE *queue, size_t size, size_t alignment, + bool isShared) { void *memPtr = nullptr; cl_int err; if (isShared) { auto func = queue->ext_table_ ? queue->ext_table_->allocShared : (clSharedMemAllocINTEL_fn)queryCLExtFunc( queue->device_, SharedMemAllocName); - CHECK2(memPtr = func(queue->context_, queue->device_, nullptr, size, - alignment, &err)); + CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size, + alignment, &err)); } else { auto func = queue->ext_table_ ? queue->ext_table_->allocDev : (clDeviceMemAllocINTEL_fn)queryCLExtFunc( queue->device_, DeviceMemAllocName); - CHECK2(memPtr = func(queue->context_, queue->device_, nullptr, size, - alignment, &err)); + CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size, + alignment, &err)); } return memPtr; } -static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) { +static void deallocDeviceMemory(GPUCLQUEUE *queue, void *ptr) { auto func = queue->ext_table_ ? 
queue->ext_table_->blockingFree : (clMemBlockingFreeINTEL_fn)queryCLExtFunc( queue->device_, MemBlockingFreeName); - L0_SAFE_CALL(func(queue->context_, ptr)); + CL_SAFE_CALL(func(queue->context_, ptr)); } -static cl_program loadModule(GPUSYCLQUEUE *queue, const unsigned char *data, +static cl_program loadModule(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) { assert(data); cl_int errNum = 0; @@ -324,8 +297,9 @@ static cl_program loadModule(GPUSYCLQUEUE *queue, const unsigned char *data, size_t sizes[1] = {dataSize}; cl_program program; cl_int err; - CHECK2(program = clCreateProgramWithBinary( - queue->context_, 1, &queue->device_, sizes, codes, &err, &errNum)); + CL_SAFE_CALL2(program = clCreateProgramWithBinary(queue->context_, 1, + &queue->device_, sizes, + codes, &err, &errNum)); const char *build_flags = "-cl-kernel-arg-info -x spir"; // enable large register file if needed if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) { @@ -334,29 +308,31 @@ static cl_program loadModule(GPUSYCLQUEUE *queue, const unsigned char *data, "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer " "'-printregusage -enableBCR' -cl-kernel-arg-info -x spir"; } - L0_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL)); + CL_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL)); + queue->programs_.push_back(program); return program; } -static cl_kernel getKernel(GPUSYCLQUEUE *queue, cl_program program, +static cl_kernel getKernel(GPUCLQUEUE *queue, cl_program program, const char *name) { cl_kernel kernel; cl_int err; - CHECK2(kernel = clCreateKernel(program, name, &err)); + CL_SAFE_CALL2(kernel = clCreateKernel(program, name, &err)); cl_bool TrueVal = CL_TRUE; - L0_SAFE_CALL(clSetKernelExecInfo( + CL_SAFE_CALL(clSetKernelExecInfo( kernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, sizeof(cl_bool), &TrueVal)); - L0_SAFE_CALL(clSetKernelExecInfo( + CL_SAFE_CALL(clSetKernelExecInfo( kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, sizeof(cl_bool), &TrueVal)); - L0_SAFE_CALL(clSetKernelExecInfo( + CL_SAFE_CALL(clSetKernelExecInfo( kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &TrueVal)); + queue->kernels_.push_back(kernel); return kernel; } -static void launchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, +static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, ParamDesc *params) { @@ -373,90 +349,79 @@ static void launchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, for (size_t i = 0; i < paramsCount; i++) { cl_kernel_arg_address_qualifier name; size_t nameSize = sizeof(name); - L0_SAFE_CALL(clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER, + CL_SAFE_CALL(clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER, sizeof(name), &name, &nameSize)); auto param = params[i]; if (param.size == sizeof(void *) && name == CL_KERNEL_ARG_ADDRESS_GLOBAL) { - L0_SAFE_CALL(func(kernel, i, *(void **)param.data)); + CL_SAFE_CALL(func(kernel, i, *(void **)param.data)); } else { - L0_SAFE_CALL(clSetKernelArg(kernel, i, param.size, param.data)); + CL_SAFE_CALL(clSetKernelArg(kernel, i, param.size, param.data)); } } if (sharedMemBytes) { - L0_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr)); + CL_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr)); } size_t globalSize[3] = {gridX, gridY, gridZ}; size_t localSize[3] = {blockX, blockY, blockZ}; - 
L0_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL, + CL_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL, globalSize, localSize, 0, NULL, NULL)); } // Wrappers -extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device, - void *context) { - return catchAll([&]() { - return new GPUSYCLQUEUE(reinterpret_cast(device), - reinterpret_cast(context), nullptr); - }); +extern "C" SYCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device, + void *context) { + return new GPUCLQUEUE(reinterpret_cast(device), + reinterpret_cast(context), nullptr); } -extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) { - catchAll([&]() { delete queue; }); +extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) { + delete queue; } extern "C" SYCL_RUNTIME_EXPORT void * -gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { - return catchAll([&]() { - if (queue) { - return allocDeviceMemory(queue, size, alignment, isShared); - } - }); +gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { + if (queue) { + return allocDeviceMemory(queue, size, alignment, isShared); + } + return nullptr; } -extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) { - catchAll([&]() { - if (queue && ptr) { - deallocDeviceMemory(queue, ptr); - } - }); +extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUCLQUEUE *queue, void *ptr) { + if (queue && ptr) { + deallocDeviceMemory(queue, ptr); + } } extern "C" SYCL_RUNTIME_EXPORT cl_program -gpuModuleLoad(GPUSYCLQUEUE *queue, const unsigned char *data, size_t dataSize) { - return catchAll([&]() { - if (queue) { - return loadModule(queue, data, dataSize); - } - }); +gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) { + if (queue) { + return loadModule(queue, data, dataSize); + } + return nullptr; } -extern "C" SYCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUSYCLQUEUE *queue, +extern "C" SYCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUCLQUEUE *queue, cl_program module, const char *name) { - return catchAll([&]() { - if (queue) { - return getKernel(queue, module, name); - } - }); + if (queue) { + return getKernel(queue, module, name); + } + return nullptr; } extern "C" SYCL_RUNTIME_EXPORT void -gpuLaunchKernel(GPUSYCLQUEUE *queue, cl_kernel kernel, size_t gridX, - size_t gridY, size_t gridZ, size_t blockX, size_t blockY, - size_t blockZ, size_t sharedMemBytes, void *params) { - return catchAll([&]() { - if (queue) { - launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, - sharedMemBytes, static_cast(params)); - } - }); +gpuLaunchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, + size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, + size_t sharedMemBytes, void *params) { + if (queue) { + launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, + sharedMemBytes, static_cast(params)); + } } -extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) { - catchAll([&]() { - if (queue) { - L0_SAFE_CALL(clFinish(queue->queue_)); - } - }); +extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUCLQUEUE *queue) { + if (queue) { + CL_SAFE_CALL(clFinish(queue->queue_)); + } } diff --git a/test/Conversion/GPUToSPIRV/printf_with_runner.mlir b/test/Conversion/GPUToSPIRV/printf_with_runner.mlir index 4d40f1def..afe229a64 100644 --- a/test/Conversion/GPUToSPIRV/printf_with_runner.mlir +++ b/test/Conversion/GPUToSPIRV/printf_with_runner.mlir @@ -2,6 +2,10 @@ // RUN: 
--runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module attributes { gpu.container_module }{ diff --git a/test/Gen/PlaidML/OpTest.Argmax.mlir.in b/test/Gen/PlaidML/OpTest.Argmax.mlir.in index 2acb30705..2e6581e1a 100644 --- a/test/Gen/PlaidML/OpTest.Argmax.mlir.in +++ b/test/Gen/PlaidML/OpTest.Argmax.mlir.in @@ -13,6 +13,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/Gen/PlaidML/OpTest.BroadcastNonNumpy.mlir.in b/test/Gen/PlaidML/OpTest.BroadcastNonNumpy.mlir.in index cf599405f..609fe0cc7 100644 --- a/test/Gen/PlaidML/OpTest.BroadcastNonNumpy.mlir.in +++ b/test/Gen/PlaidML/OpTest.BroadcastNonNumpy.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @broadcast_non_numpy { diff --git a/test/Gen/PlaidML/OpTest.ComplexConv2D.mlir.in b/test/Gen/PlaidML/OpTest.ComplexConv2D.mlir.in index c803be274..adf4f6774 100644 --- a/test/Gen/PlaidML/OpTest.ComplexConv2D.mlir.in +++ b/test/Gen/PlaidML/OpTest.ComplexConv2D.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1 * 2 + d5 * 3, d2 * 2 + d6 * 3, d3, d7)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d5, d6, d3, d7, d4)> diff --git a/test/Gen/PlaidML/OpTest.EltwiseAdd.mlir.in b/test/Gen/PlaidML/OpTest.EltwiseAdd.mlir.in index 4cc12391c..d9a276e69 100644 --- a/test/Gen/PlaidML/OpTest.EltwiseAdd.mlir.in +++ b/test/Gen/PlaidML/OpTest.EltwiseAdd.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck 
+// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @eltwise_add { func.func @test(%arg0: tensor<10x20x@DTYPE@>, %arg1: tensor<10x20x@DTYPE@>) -> tensor<10x20x@DTYPE@> { diff --git a/test/Gen/PlaidML/OpTest.EltwiseAddInt.mlir.in b/test/Gen/PlaidML/OpTest.EltwiseAddInt.mlir.in index c5185d2c8..3bad5cd13 100644 --- a/test/Gen/PlaidML/OpTest.EltwiseAddInt.mlir.in +++ b/test/Gen/PlaidML/OpTest.EltwiseAddInt.mlir.in @@ -13,6 +13,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @eltwise_add { func.func @test(%arg0: tensor<10x20x@DTYPE@>, %arg1: tensor<10x20x@DTYPE@>) -> tensor<10x20x@DTYPE@> { diff --git a/test/Gen/PlaidML/OpTest.ExplicitPadding.mlir.in b/test/Gen/PlaidML/OpTest.ExplicitPadding.mlir.in index 36f503b8e..65e9863b9 100644 --- a/test/Gen/PlaidML/OpTest.ExplicitPadding.mlir.in +++ b/test/Gen/PlaidML/OpTest.ExplicitPadding.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0 + 2, d1 + 1)> module @explicit_padding { diff --git a/test/Gen/PlaidML/OpTest.LogicalAnd_mixed.mlir.in b/test/Gen/PlaidML/OpTest.LogicalAnd_mixed.mlir.in index 4f2f50cad..c3c6773ed 100644 --- a/test/Gen/PlaidML/OpTest.LogicalAnd_mixed.mlir.in +++ b/test/Gen/PlaidML/OpTest.LogicalAnd_mixed.mlir.in @@ -12,6 +12,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_and { func.func @main() { diff --git a/test/Gen/PlaidML/OpTest.MaxPool1D.mlir.in b/test/Gen/PlaidML/OpTest.MaxPool1D.mlir.in index 028678447..dafcc7166 100644 --- a/test/Gen/PlaidML/OpTest.MaxPool1D.mlir.in +++ b/test/Gen/PlaidML/OpTest.MaxPool1D.mlir.in @@ -13,6 +13,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: 
--entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0)> module @max_pool_1d { diff --git a/test/Gen/PlaidML/OpTest.Quantize.mlir.in b/test/Gen/PlaidML/OpTest.Quantize.mlir.in index 9efadc565..535d3d8fb 100644 --- a/test/Gen/PlaidML/OpTest.Quantize.mlir.in +++ b/test/Gen/PlaidML/OpTest.Quantize.mlir.in @@ -12,6 +12,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> module @quantize { diff --git a/test/Gen/PlaidML/OpTest.Relu.mlir.in b/test/Gen/PlaidML/OpTest.Relu.mlir.in index f7dbe99fe..d1dd5dc9b 100644 --- a/test/Gen/PlaidML/OpTest.Relu.mlir.in +++ b/test/Gen/PlaidML/OpTest.Relu.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @relu { diff --git a/test/Gen/PlaidML/OpTest.Softmax.mlir.in b/test/Gen/PlaidML/OpTest.Softmax.mlir.in index cf10324b3..857221057 100644 --- a/test/Gen/PlaidML/OpTest.Softmax.mlir.in +++ b/test/Gen/PlaidML/OpTest.Softmax.mlir.in @@ -15,6 +15,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0, 0)> module @softmax { diff --git a/test/Gen/PlaidML/OpTest.Sum.mlir.in b/test/Gen/PlaidML/OpTest.Sum.mlir.in index 3c6ff8c28..cd913ac25 100644 --- a/test/Gen/PlaidML/OpTest.Sum.mlir.in +++ b/test/Gen/PlaidML/OpTest.Sum.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @sum { diff --git a/test/Gen/PlaidML/OpTest.Transpose.mlir.in b/test/Gen/PlaidML/OpTest.Transpose.mlir.in index 6bc65e224..ee6844879 100644 --- a/test/Gen/PlaidML/OpTest.Transpose.mlir.in +++ b/test/Gen/PlaidML/OpTest.Transpose.mlir.in @@ -14,6 +14,10 @@ // RUN: --runner 
imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @transpose { diff --git a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir index a70a08c76..d4fa44a85 100644 --- a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir +++ b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A : memref<8x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir index 45036fbbf..de6f2dd53 100644 --- a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/flash_attention_fwd.mlir b/test/Integration/Dialect/XeGPU/flash_attention_fwd.mlir index 65ca640e8..ce0f77700 100644 --- a/test/Integration/Dialect/XeGPU/flash_attention_fwd.mlir +++ b/test/Integration/Dialect/XeGPU/flash_attention_fwd.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @flash_attention attributes {gpu.container_module} { gpu.module @flash_attention_fwd attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @flash_attention_fwd( diff --git 
a/test/Integration/Dialect/XeGPU/gemm_1024x1016x1016_f16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_1024x1016x1016_f16_f16_f32.mlir index 28f2a3210..13084ed15 100644 --- a/test/Integration/Dialect/XeGPU/gemm_1024x1016x1016_f16_f16_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_1024x1016x1016_f16_f16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_1024x1016xf16 : memref<1024x1016xf16> = dense<1.0> memref.global "private" @__constant_1016x1016xf16_ : memref<1016x1016xf16> = dense<1.0> diff --git a/test/Integration/Dialect/XeGPU/gemm_1024x1024xbf16.mlir b/test/Integration/Dialect/XeGPU/gemm_1024x1024xbf16.mlir index c96bcc20d..0f8e710b2 100644 --- a/test/Integration/Dialect/XeGPU/gemm_1024x1024xbf16.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_1024x1024xbf16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_1024x1024xbf16 : memref<1024x1024xbf16> = dense<0.0> memref.global "private" @__constant_1024x1024xbf16_ : memref<1024x1024xbf16> = dense<0.0> diff --git a/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir b/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir index f056d75e0..30644a4d7 100644 --- a/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_1024x1024xf16 : memref<1024x1024xf16> = dense<0.0> memref.global "private" @__constant_1024x1024xf16_ : memref<1024x1024xf16> = dense<0.0> diff --git a/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir b/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir index 0807fa5de..a3ec8bf93 100644 --- a/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: 
--shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_1024x1024xf16 : memref<1024x1024xf16> = dense<0.0> memref.global "private" @__constant_1024x1024xf16_ : memref<1024x1024xf16> = dense<0.0> diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir index ae7954564..800b1a952 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<4096x4096xf16>, %B: memref<4096x4096xf16>, %C: memref<4096x4096xf16>) -> memref<4096x4096xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir index a65503775..dd917c452 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<4096x4096xf16>, %B: memref<4096x4096xf16>, %C: memref<4096x4096xf16>) -> memref<4096x4096xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir index b65ed81c3..e2d49e4e0 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<4096x4096xf16>, %B: memref<4096x4096xf16>, %C: memref<4096x4096xf16>) -> memref<4096x4096xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir index c0d5efb5f..422e12de7 100644 --- a/test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_1024x1024xf16 : memref<1024x1024xf16> = dense<0.0> memref.global "private" @__constant_1024x1024xf16_ : memref<1024x1024xf16> = dense<0.0> diff --git a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_1d_vector_shuffle.vc.mlir b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_1d_vector_shuffle.vc.mlir index 4fc95bd4d..6d411852f 100644 --- a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_1d_vector_shuffle.vc.mlir +++ b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_1d_vector_shuffle.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<8x32xf16>) -> memref<8x32xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_2d_vector_shuffle.vc.mlir b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_2d_vector_shuffle.vc.mlir index 9f5639279..1b4377492 100644 --- a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_2d_vector_shuffle.vc.mlir +++ b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_2d_vector_shuffle.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm 
attributes {gpu.container_module} { func.func @test(%arg0: memref<8x32xf16>) -> memref<8x32xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_constant_vector_shuffle.vc.mlir b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_constant_vector_shuffle.vc.mlir index fe0cff169..72ccf1a18 100644 --- a/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_constant_vector_shuffle.vc.mlir +++ b/test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_constant_vector_shuffle.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test() -> memref<8x32xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/load2d-padding-f32.mlir b/test/Integration/Dialect/XeGPU/load2d-padding-f32.mlir index ef6db8b5e..1c2bbd7f1 100644 --- a/test/Integration/Dialect/XeGPU/load2d-padding-f32.mlir +++ b/test/Integration/Dialect/XeGPU/load2d-padding-f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<1.0> func.func @test(%arg0: memref<8x16xf32>,%arg1:index)attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeGPU/load2d-padding.mlir b/test/Integration/Dialect/XeGPU/load2d-padding.mlir index 22a97f496..84a081a06 100644 --- a/test/Integration/Dialect/XeGPU/load2d-padding.mlir +++ b/test/Integration/Dialect/XeGPU/load2d-padding.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { // memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> diff --git a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir b/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir index 98de82596..bdcf4952a 100644 --- a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir +++ b/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir @@ -6,6 +6,10 @@ // RUN: 
--runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> memref.global "private" @__constant_16x16xf16 : memref<16x16xf16> = dense<1.0> diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir index ad318ecd3..3791e330b 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_16x32xf16 : memref<16x32xf16> = dense<5.000000e-01> func.func @test(%arg0: memref<16x32xf16>) -> memref<16x32xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir index b672f457c..517f1a261 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<32x32xf16>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir index 72a06b2e6..9cfb73fa0 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: 
--shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x32xf16 : memref<8x32xf16> = dense<5.000000e-01> func.func @test(%arg0: memref<8x32xf16>) -> memref<8x32xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeGPU/preop_dpas.mlir b/test/Integration/Dialect/XeGPU/preop_dpas.mlir index c3bef77ca..199fdbcf7 100644 --- a/test/Integration/Dialect/XeGPU/preop_dpas.mlir +++ b/test/Integration/Dialect/XeGPU/preop_dpas.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0> func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeGPU/transpose_8x16xf16.mlir b/test/Integration/Dialect/XeGPU/transpose_8x16xf16.mlir index 67b5ddbb7..ac5cdb0e0 100644 --- a/test/Integration/Dialect/XeGPU/transpose_8x16xf16.mlir +++ b/test/Integration/Dialect/XeGPU/transpose_8x16xf16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @transpose attributes {gpu.container_module} { func.func @test(%arg0: memref<32x32xf16>) -> memref<32x32xf16> attributes {llvm.emit_c_interface} { %c4 = arith.constant 4 : index diff --git a/test/Integration/Dialect/XeGPU/transpose_8x16xf32.mlir b/test/Integration/Dialect/XeGPU/transpose_8x16xf32.mlir index ee5b0a2d8..1afda09ca 100644 --- a/test/Integration/Dialect/XeGPU/transpose_8x16xf32.mlir +++ b/test/Integration/Dialect/XeGPU/transpose_8x16xf32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @transpose attributes {gpu.container_module} { func.func @test(%arg0: memref<32x32xf32>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c4 = arith.constant 4 : index diff --git a/test/Integration/Dialect/XeGPU/transpose_8x8xf16.mlir b/test/Integration/Dialect/XeGPU/transpose_8x8xf16.mlir index 53786b92e..9aece81ad 100644 --- a/test/Integration/Dialect/XeGPU/transpose_8x8xf16.mlir +++ 
b/test/Integration/Dialect/XeGPU/transpose_8x8xf16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @transpose attributes {gpu.container_module} { func.func @test(%arg0: memref<32x32xf16>) -> memref<32x32xf16> attributes {llvm.emit_c_interface} { %c4 = arith.constant 4 : index diff --git a/test/Integration/Dialect/XeGPU/transpose_8x8xf32.mlir b/test/Integration/Dialect/XeGPU/transpose_8x8xf32.mlir index 51579ad83..da20916b5 100644 --- a/test/Integration/Dialect/XeGPU/transpose_8x8xf32.mlir +++ b/test/Integration/Dialect/XeGPU/transpose_8x8xf32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @transpose attributes {gpu.container_module} { func.func @test(%arg0: memref<32x32xf32>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c4 = arith.constant 4 : index diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir index 4cbfab6bc..6263c7bf3 100644 --- a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir index e523f0c9e..dfb33796d 100644 --- a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes 
{gpu.container_module} { func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir index 53494e0cf..774bda3f2 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir index 5803b748c..29b769288 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<32x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir index 53b57afc8..966a9948b 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir index bcca52e98..1f109159f 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // 
RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Jax/gordon/jit__logsm_from_logmhalo_jax_kern_0_before_linalg.mlir b/test/Jax/gordon/jit__logsm_from_logmhalo_jax_kern_0_before_linalg.mlir index a8ca09e52..de3acae24 100644 --- a/test/Jax/gordon/jit__logsm_from_logmhalo_jax_kern_0_before_linalg.mlir +++ b/test/Jax/gordon/jit__logsm_from_logmhalo_jax_kern_0_before_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<() -> ()> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<(d0) -> (d0)> diff --git a/test/Jax/janet/jit__get_age_weights_from_tables.8_linalg.mlir b/test/Jax/janet/jit__get_age_weights_from_tables.8_linalg.mlir index d61fb7506..8c1e08a74 100644 --- a/test/Jax/janet/jit__get_age_weights_from_tables.8_linalg.mlir +++ b/test/Jax/janet/jit__get_age_weights_from_tables.8_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/Jax/janet/jit__get_lgt_birth.7_linalg.mlir b/test/Jax/janet/jit__get_lgt_birth.7_linalg.mlir index 8aabfa3d8..18543f9ff 100644 --- a/test/Jax/janet/jit__get_lgt_birth.7_linalg.mlir +++ b/test/Jax/janet/jit__get_lgt_birth.7_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> ()> #map1 = affine_map<(d0) -> (d0)> #map2 = affine_map<() -> ()> diff --git a/test/Jax/janet/jit__get_met_weights_singlegal.43_linalg.mlir b/test/Jax/janet/jit__get_met_weights_singlegal.43_linalg.mlir index 35c129aef..fb8de1889 100644 --- a/test/Jax/janet/jit__get_met_weights_singlegal.43_linalg.mlir +++ b/test/Jax/janet/jit__get_met_weights_singlegal.43_linalg.mlir @@ -10,6 +10,10 @@ 
// RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/Jax/janet/jit__net_loss.91_linalg.mlir b/test/Jax/janet/jit__net_loss.91_linalg.mlir index c33e58b07..4bce3abe5 100644 --- a/test/Jax/janet/jit__net_loss.91_linalg.mlir +++ b/test/Jax/janet/jit__net_loss.91_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> #map2 = affine_map<(d0, d1) -> ()> diff --git a/test/Jax/janet/jit__unit_scale_traindata.47_linalg.mlir b/test/Jax/janet/jit__unit_scale_traindata.47_linalg.mlir index 7f4da94eb..16e589819 100644 --- a/test/Jax/janet/jit__unit_scale_traindata.47_linalg.mlir +++ b/test/Jax/janet/jit__unit_scale_traindata.47_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0)> #map2 = affine_map<(d0, d1) -> (d0, d1)> diff --git a/test/Jax/qoc/jit__diag.11_linalg.mlir b/test/Jax/qoc/jit__diag.11_linalg.mlir index 9b87b1200..ad8fd8aed 100644 --- a/test/Jax/qoc/jit__diag.11_linalg.mlir +++ b/test/Jax/qoc/jit__diag.11_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0)> #map2 = affine_map<(d0, d1) -> (d0, d1)> diff --git a/test/Jax/qoc/jit__reduce_sum.357_linalg.mlir b/test/Jax/qoc/jit__reduce_sum.357_linalg.mlir index 1895693e4..050d7a2e6 100644 --- a/test/Jax/qoc/jit__reduce_sum.357_linalg.mlir +++ b/test/Jax/qoc/jit__reduce_sum.357_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> module @jit__reduce_sum.357 { diff --git a/test/Jax/qoc/jit_absolute.341_linalg.mlir b/test/Jax/qoc/jit_absolute.341_linalg.mlir index c295e7b5b..636bc1baf 100644 --- a/test/Jax/qoc/jit_absolute.341_linalg.mlir +++ b/test/Jax/qoc/jit_absolute.341_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<() -> ()> module @jit_absolute.341 { diff --git a/test/Jax/qoc/jit_matmul.338_linalg.mlir b/test/Jax/qoc/jit_matmul.338_linalg.mlir index 783253e5b..a8d164d32 100644 --- a/test/Jax/qoc/jit_matmul.338_linalg.mlir +++ b/test/Jax/qoc/jit_matmul.338_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d3)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> diff --git a/test/Jax/qoc/jit_swapaxes.304_linalg.mlir b/test/Jax/qoc/jit_swapaxes.304_linalg.mlir index ae7688096..0e2b92732 100644 --- a/test/Jax/qoc/jit_swapaxes.304_linalg.mlir +++ b/test/Jax/qoc/jit_swapaxes.304_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @jit_swapaxes.304 { diff --git a/test/Jax/qoc/jit_trace.340_linalg.mlir b/test/Jax/qoc/jit_trace.340_linalg.mlir index e864f850c..049be9730 100644 --- a/test/Jax/qoc/jit_trace.340_linalg.mlir +++ b/test/Jax/qoc/jit_trace.340_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0)> #map2 = affine_map<(d0, d1) -> (d0, d1)> diff --git 
a/test/PlaidML/CppEdsl.BigDot.mlir b/test/PlaidML/CppEdsl.BigDot.mlir index 711b323e8..f33f1baf0 100644 --- a/test/PlaidML/CppEdsl.BigDot.mlir +++ b/test/PlaidML/CppEdsl.BigDot.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/CppEdsl.BitAnd.mlir b/test/PlaidML/CppEdsl.BitAnd.mlir index 5f9d88a52..ce48c7f58 100644 --- a/test/PlaidML/CppEdsl.BitAnd.mlir +++ b/test/PlaidML/CppEdsl.BitAnd.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_and { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BitAndScalar.mlir b/test/PlaidML/CppEdsl.BitAndScalar.mlir index bfd8f26e8..003ef4f0e 100644 --- a/test/PlaidML/CppEdsl.BitAndScalar.mlir +++ b/test/PlaidML/CppEdsl.BitAndScalar.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @bit_and { diff --git a/test/PlaidML/CppEdsl.BitLeft.mlir b/test/PlaidML/CppEdsl.BitLeft.mlir index 556d038f8..0edddc7be 100644 --- a/test/PlaidML/CppEdsl.BitLeft.mlir +++ b/test/PlaidML/CppEdsl.BitLeft.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_left { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BitNot.mlir b/test/PlaidML/CppEdsl.BitNot.mlir index 1a264f42b..73124c476 100644 --- a/test/PlaidML/CppEdsl.BitNot.mlir +++ b/test/PlaidML/CppEdsl.BitNot.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_not { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BitOr.mlir b/test/PlaidML/CppEdsl.BitOr.mlir index c7e990476..393ab32e7 100644 --- a/test/PlaidML/CppEdsl.BitOr.mlir +++ b/test/PlaidML/CppEdsl.BitOr.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_or { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BitRightScalar.mlir b/test/PlaidML/CppEdsl.BitRightScalar.mlir index 60eb23a01..212f456f1 100644 --- a/test/PlaidML/CppEdsl.BitRightScalar.mlir +++ b/test/PlaidML/CppEdsl.BitRightScalar.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @bit_right_scalar { diff --git a/test/PlaidML/CppEdsl.BitRightTensor.mlir b/test/PlaidML/CppEdsl.BitRightTensor.mlir index bb8bece50..003e4d477 100644 --- a/test/PlaidML/CppEdsl.BitRightTensor.mlir +++ b/test/PlaidML/CppEdsl.BitRightTensor.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_right_tensor { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BitXor.mlir b/test/PlaidML/CppEdsl.BitXor.mlir index 73cf52db4..3a2f627be 100644 --- a/test/PlaidML/CppEdsl.BitXor.mlir +++ b/test/PlaidML/CppEdsl.BitXor.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @bit_xor { func.func @main() { diff --git a/test/PlaidML/CppEdsl.BroadcastCmp.mlir b/test/PlaidML/CppEdsl.BroadcastCmp.mlir index 6af103c13..d19e60a81 100644 --- a/test/PlaidML/CppEdsl.BroadcastCmp.mlir +++ 
b/test/PlaidML/CppEdsl.BroadcastCmp.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0, 0)> module @broadcast_cmp { diff --git a/test/PlaidML/CppEdsl.Cast.mlir b/test/PlaidML/CppEdsl.Cast.mlir index 27d55e8d4..70d8f3634 100644 --- a/test/PlaidML/CppEdsl.Cast.mlir +++ b/test/PlaidML/CppEdsl.Cast.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @cast { func.func @test(%arg0: tensor<3x3xi64>) -> tensor<3x3xi32> { diff --git a/test/PlaidML/CppEdsl.ConstAdd.mlir b/test/PlaidML/CppEdsl.ConstAdd.mlir index 0499abb84..67332b25c 100644 --- a/test/PlaidML/CppEdsl.ConstAdd.mlir +++ b/test/PlaidML/CppEdsl.ConstAdd.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0) -> (d0)> module @const_add { func.func @test(%arg0: tensor<4xi32> {stdx.const}, %arg1: tensor<4xi32> {stdx.const}) -> tensor<4xi32> { diff --git a/test/PlaidML/CppEdsl.ConvI8.mlir b/test/PlaidML/CppEdsl.ConvI8.mlir index 7b6efd06a..65b9b0e11 100644 --- a/test/PlaidML/CppEdsl.ConvI8.mlir +++ b/test/PlaidML/CppEdsl.ConvI8.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> diff --git a/test/PlaidML/CppEdsl.Convolution.mlir b/test/PlaidML/CppEdsl.Convolution.mlir index af40c89ae..379f87ecc 100644 --- a/test/PlaidML/CppEdsl.Convolution.mlir +++ b/test/PlaidML/CppEdsl.Convolution.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner 
--requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> diff --git a/test/PlaidML/CppEdsl.Cos.mlir b/test/PlaidML/CppEdsl.Cos.mlir index 9fcf963f9..8d5811ad0 100644 --- a/test/PlaidML/CppEdsl.Cos.mlir +++ b/test/PlaidML/CppEdsl.Cos.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @cos { func.func @test(%arg0: tensor<3x3xf32>) -> tensor<3x3xf32> { diff --git a/test/PlaidML/CppEdsl.CumSum.mlir b/test/PlaidML/CppEdsl.CumSum.mlir index 860db8140..1556918b6 100644 --- a/test/PlaidML/CppEdsl.CumSum.mlir +++ b/test/PlaidML/CppEdsl.CumSum.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0)> #set = affine_set<(d0, d1) : (d0 - d1 >= 0)> diff --git a/test/PlaidML/CppEdsl.DefractLong.mlir b/test/PlaidML/CppEdsl.DefractLong.mlir index 6625fcd2b..d21051960 100644 --- a/test/PlaidML/CppEdsl.DefractLong.mlir +++ b/test/PlaidML/CppEdsl.DefractLong.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10) -> (d1, -d0 + d3 + d8 + 1, -d0 + d4 + d5 + d9, d10)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10) -> (d0 * 4 + d2 * 2 + d7 - d8 * 2, d0 * 4 + d2 - d4 + d7 - d9 * 2 + 3, d6, d10)> diff --git a/test/PlaidML/CppEdsl.Dot.mlir b/test/PlaidML/CppEdsl.Dot.mlir index 2f5f7e222..b04e95e42 100644 --- a/test/PlaidML/CppEdsl.Dot.mlir +++ b/test/PlaidML/CppEdsl.Dot.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// 
RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/CppEdsl.DotF16.mlir b/test/PlaidML/CppEdsl.DotF16.mlir index e55d62e0e..80c81993d 100644 --- a/test/PlaidML/CppEdsl.DotF16.mlir +++ b/test/PlaidML/CppEdsl.DotF16.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/CppEdsl.DotF16_AccF32.mlir b/test/PlaidML/CppEdsl.DotF16_AccF32.mlir index e831aeadf..d24bbdbe9 100644 --- a/test/PlaidML/CppEdsl.DotF16_AccF32.mlir +++ b/test/PlaidML/CppEdsl.DotF16_AccF32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> diff --git a/test/PlaidML/CppEdsl.DoubleDot.mlir b/test/PlaidML/CppEdsl.DoubleDot.mlir index 700b77358..79168112d 100644 --- a/test/PlaidML/CppEdsl.DoubleDot.mlir +++ b/test/PlaidML/CppEdsl.DoubleDot.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/CppEdsl.DupOut.mlir b/test/PlaidML/CppEdsl.DupOut.mlir index 558810ae8..2d719c051 100644 --- a/test/PlaidML/CppEdsl.DupOut.mlir +++ b/test/PlaidML/CppEdsl.DupOut.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git 
a/test/PlaidML/CppEdsl.EltwiseMod.mlir b/test/PlaidML/CppEdsl.EltwiseMod.mlir index 58e7419f7..c4c057e55 100644 --- a/test/PlaidML/CppEdsl.EltwiseMod.mlir +++ b/test/PlaidML/CppEdsl.EltwiseMod.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @mod { func.func @main() { diff --git a/test/PlaidML/CppEdsl.Erf.mlir b/test/PlaidML/CppEdsl.Erf.mlir index 35d33fe47..ecb2ae421 100644 --- a/test/PlaidML/CppEdsl.Erf.mlir +++ b/test/PlaidML/CppEdsl.Erf.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @erf { func.func @main() { diff --git a/test/PlaidML/OpTest.Abs.mlir b/test/PlaidML/OpTest.Abs.mlir index 308ca8621..861e2d910 100644 --- a/test/PlaidML/OpTest.Abs.mlir +++ b/test/PlaidML/OpTest.Abs.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> module @abs { diff --git a/test/PlaidML/OpTest.Argmax.mlir b/test/PlaidML/OpTest.Argmax.mlir index c5c786985..b6ffd6a0d 100644 --- a/test/PlaidML/OpTest.Argmax.mlir +++ b/test/PlaidML/OpTest.Argmax.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/PlaidML/OpTest.BinaryCrossentropy.mlir b/test/PlaidML/OpTest.BinaryCrossentropy.mlir index 8fb3e9481..3641c5b40 100644 --- a/test/PlaidML/OpTest.BinaryCrossentropy.mlir +++ b/test/PlaidML/OpTest.BinaryCrossentropy.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> module @binary_crossentropy { diff --git a/test/PlaidML/OpTest.BroadcastNonNumpy.mlir b/test/PlaidML/OpTest.BroadcastNonNumpy.mlir index 70d5871d4..b64966910 100644 --- a/test/PlaidML/OpTest.BroadcastNonNumpy.mlir +++ b/test/PlaidML/OpTest.BroadcastNonNumpy.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @broadcast_non_numpy { diff --git a/test/PlaidML/OpTest.ComplexConv2D.mlir b/test/PlaidML/OpTest.ComplexConv2D.mlir index 1a14bca12..4dd965cf9 100644 --- a/test/PlaidML/OpTest.ComplexConv2D.mlir +++ b/test/PlaidML/OpTest.ComplexConv2D.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1 * 2 + d5 * 3, d2 * 2 + d6 * 3, d3, d7)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d5, d6, d3, d7, d4)> diff --git a/test/PlaidML/OpTest.Conv1D.mlir b/test/PlaidML/OpTest.Conv1D.mlir index a2839f423..001f3ce5b 100644 --- a/test/PlaidML/OpTest.Conv1D.mlir +++ b/test/PlaidML/OpTest.Conv1D.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1 + d3, d4)> #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4, d2)> #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> diff --git a/test/PlaidML/OpTest.Conv2DDilated.mlir b/test/PlaidML/OpTest.Conv2DDilated.mlir index 27aecce68..b9ad8ae91 100644 --- a/test/PlaidML/OpTest.Conv2DDilated.mlir +++ b/test/PlaidML/OpTest.Conv2DDilated.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: 
--shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4 * 2, d2 + d5 * 3, d6)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> diff --git a/test/PlaidML/OpTest.Dot.mlir b/test/PlaidML/OpTest.Dot.mlir index 1141bccd9..77c228f9e 100644 --- a/test/PlaidML/OpTest.Dot.mlir +++ b/test/PlaidML/OpTest.Dot.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.DotF16.mlir b/test/PlaidML/OpTest.DotF16.mlir index 3084b21ac..51f681815 100644 --- a/test/PlaidML/OpTest.DotF16.mlir +++ b/test/PlaidML/OpTest.DotF16.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module @dot_f16 { diff --git a/test/PlaidML/OpTest.EltwiseAdd.mlir b/test/PlaidML/OpTest.EltwiseAdd.mlir index f4ba71d32..62d3e9559 100644 --- a/test/PlaidML/OpTest.EltwiseAdd.mlir +++ b/test/PlaidML/OpTest.EltwiseAdd.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @eltwise_add { func.func @test(%arg0: tensor<10x20xf32>, %arg1: tensor<10x20xf32>) -> tensor<10x20xf32> { diff --git a/test/PlaidML/OpTest.ExplicitPadding.mlir b/test/PlaidML/OpTest.ExplicitPadding.mlir index ff7bf50ae..75ae17b9c 100644 --- a/test/PlaidML/OpTest.ExplicitPadding.mlir +++ b/test/PlaidML/OpTest.ExplicitPadding.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0 + 2, d1 + 1)> module @explicit_padding { diff --git 
a/test/PlaidML/OpTest.ExplicitPaddingInf.mlir b/test/PlaidML/OpTest.ExplicitPaddingInf.mlir index b1ec820b2..9fe99da7c 100644 --- a/test/PlaidML/OpTest.ExplicitPaddingInf.mlir +++ b/test/PlaidML/OpTest.ExplicitPaddingInf.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0 + 2, d1 + 1)> module @explicit_padding { diff --git a/test/PlaidML/OpTest.ExplicitPaddingNegInf.mlir b/test/PlaidML/OpTest.ExplicitPaddingNegInf.mlir index 89ecb6df6..b06c16947 100644 --- a/test/PlaidML/OpTest.ExplicitPaddingNegInf.mlir +++ b/test/PlaidML/OpTest.ExplicitPaddingNegInf.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0 + 2, d1 + 1)> module @explicit_padding { diff --git a/test/PlaidML/OpTest.Floor.mlir b/test/PlaidML/OpTest.Floor.mlir index 667d19900..1075a696d 100644 --- a/test/PlaidML/OpTest.Floor.mlir +++ b/test/PlaidML/OpTest.Floor.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @floor { func.func @main() { diff --git a/test/PlaidML/OpTest.GEMM_INT32.mlir b/test/PlaidML/OpTest.GEMM_INT32.mlir index aca89d3d3..121a087e6 100644 --- a/test/PlaidML/OpTest.GEMM_INT32.mlir +++ b/test/PlaidML/OpTest.GEMM_INT32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.GEMM_INT64.mlir b/test/PlaidML/OpTest.GEMM_INT64.mlir index 43717033d..0c956c258 100644 --- a/test/PlaidML/OpTest.GEMM_INT64.mlir +++ b/test/PlaidML/OpTest.GEMM_INT64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: 
--shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.GEMM_INT8.mlir b/test/PlaidML/OpTest.GEMM_INT8.mlir index cc52e278f..4b059f055 100644 --- a/test/PlaidML/OpTest.GEMM_INT8.mlir +++ b/test/PlaidML/OpTest.GEMM_INT8.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.GEMM_UINT64.mlir b/test/PlaidML/OpTest.GEMM_UINT64.mlir index 475521cab..a25522d35 100644 --- a/test/PlaidML/OpTest.GEMM_UINT64.mlir +++ b/test/PlaidML/OpTest.GEMM_UINT64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.GEMVC_INT64.mlir b/test/PlaidML/OpTest.GEMVC_INT64.mlir index 5c80df9f3..dc696c35d 100644 --- a/test/PlaidML/OpTest.GEMVC_INT64.mlir +++ b/test/PlaidML/OpTest.GEMVC_INT64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> #map2 = affine_map<(d0, d1) -> (d1)> diff --git a/test/PlaidML/OpTest.GEMV_FLOAT32.mlir b/test/PlaidML/OpTest.GEMV_FLOAT32.mlir index ffcfd8748..1b65e77a0 100644 --- a/test/PlaidML/OpTest.GEMV_FLOAT32.mlir +++ b/test/PlaidML/OpTest.GEMV_FLOAT32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: 
--shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> diff --git a/test/PlaidML/OpTest.GEMV_INT32.mlir b/test/PlaidML/OpTest.GEMV_INT32.mlir index 66d0e7f58..983a91333 100644 --- a/test/PlaidML/OpTest.GEMV_INT32.mlir +++ b/test/PlaidML/OpTest.GEMV_INT32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> diff --git a/test/PlaidML/OpTest.GEMV_INT64.mlir b/test/PlaidML/OpTest.GEMV_INT64.mlir index 6d90ca9d9..89769f5eb 100644 --- a/test/PlaidML/OpTest.GEMV_INT64.mlir +++ b/test/PlaidML/OpTest.GEMV_INT64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> diff --git a/test/PlaidML/OpTest.GEMV_INT8.mlir b/test/PlaidML/OpTest.GEMV_INT8.mlir index 557b56dbf..1f95a52ff 100644 --- a/test/PlaidML/OpTest.GEMV_INT8.mlir +++ b/test/PlaidML/OpTest.GEMV_INT8.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> diff --git a/test/PlaidML/OpTest.GEMV_UINT64.mlir b/test/PlaidML/OpTest.GEMV_UINT64.mlir index 6d90ca9d9..89769f5eb 100644 --- a/test/PlaidML/OpTest.GEMV_UINT64.mlir +++ b/test/PlaidML/OpTest.GEMV_UINT64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> diff --git a/test/PlaidML/OpTest.GlobalMin.mlir b/test/PlaidML/OpTest.GlobalMin.mlir index 8f663f1e9..c1492410a 100644 --- a/test/PlaidML/OpTest.GlobalMin.mlir +++ 
b/test/PlaidML/OpTest.GlobalMin.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/PlaidML/OpTest.LarsMomentum4d.mlir b/test/PlaidML/OpTest.LarsMomentum4d.mlir index 7e36c095b..e36422394 100644 --- a/test/PlaidML/OpTest.LarsMomentum4d.mlir +++ b/test/PlaidML/OpTest.LarsMomentum4d.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/PlaidML/OpTest.Layer.mlir b/test/PlaidML/OpTest.Layer.mlir index 2be73a8cd..dfcbe4a1b 100644 --- a/test/PlaidML/OpTest.Layer.mlir +++ b/test/PlaidML/OpTest.Layer.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @relu { diff --git a/test/PlaidML/OpTest.LayerEmbeddedConst.mlir b/test/PlaidML/OpTest.LayerEmbeddedConst.mlir index 027217f95..481f86140 100644 --- a/test/PlaidML/OpTest.LayerEmbeddedConst.mlir +++ b/test/PlaidML/OpTest.LayerEmbeddedConst.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @LayerEmbeddedConst { func.func @main() { diff --git a/test/PlaidML/OpTest.LayerException.mlir b/test/PlaidML/OpTest.LayerException.mlir index dd9ecfb5e..1ab457201 100644 --- a/test/PlaidML/OpTest.LayerException.mlir +++ b/test/PlaidML/OpTest.LayerException.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner 
imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @LayerException { func.func @main() { diff --git a/test/PlaidML/OpTest.LayerMulti.mlir b/test/PlaidML/OpTest.LayerMulti.mlir index 22e58f232..7d5d3c266 100644 --- a/test/PlaidML/OpTest.LayerMulti.mlir +++ b/test/PlaidML/OpTest.LayerMulti.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @LayerMulti { func.func @main() { diff --git a/test/PlaidML/OpTest.LayerOperandOrder.mlir b/test/PlaidML/OpTest.LayerOperandOrder.mlir index 8968701ee..cdf8cb302 100644 --- a/test/PlaidML/OpTest.LayerOperandOrder.mlir +++ b/test/PlaidML/OpTest.LayerOperandOrder.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @LayerOperandOrder { func.func @main() { diff --git a/test/PlaidML/OpTest.LayerUnusedOperand.mlir b/test/PlaidML/OpTest.LayerUnusedOperand.mlir index 866bca0cd..a85c7b084 100644 --- a/test/PlaidML/OpTest.LayerUnusedOperand.mlir +++ b/test/PlaidML/OpTest.LayerUnusedOperand.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @LayerUnusedOperand { func.func @main() { diff --git a/test/PlaidML/OpTest.Lens.mlir b/test/PlaidML/OpTest.Lens.mlir index b1bab2f1d..9cab5e55c 100644 --- a/test/PlaidML/OpTest.Lens.mlir +++ b/test/PlaidML/OpTest.Lens.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @transpose_nm { diff --git a/test/PlaidML/OpTest.LogicalAnd_mixed.mlir b/test/PlaidML/OpTest.LogicalAnd_mixed.mlir index aafec5a6e..9aa6d5b6e 100644 --- a/test/PlaidML/OpTest.LogicalAnd_mixed.mlir +++ 
b/test/PlaidML/OpTest.LogicalAnd_mixed.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_and { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalAnd_uint64.mlir b/test/PlaidML/OpTest.LogicalAnd_uint64.mlir index c289f8c1d..51335eddf 100644 --- a/test/PlaidML/OpTest.LogicalAnd_uint64.mlir +++ b/test/PlaidML/OpTest.LogicalAnd_uint64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_and { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalNot_float.mlir b/test/PlaidML/OpTest.LogicalNot_float.mlir index abda4b077..1db25e2c7 100644 --- a/test/PlaidML/OpTest.LogicalNot_float.mlir +++ b/test/PlaidML/OpTest.LogicalNot_float.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_not { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalNot_int32.mlir b/test/PlaidML/OpTest.LogicalNot_int32.mlir index 7c7a85c28..5df5039a4 100644 --- a/test/PlaidML/OpTest.LogicalNot_int32.mlir +++ b/test/PlaidML/OpTest.LogicalNot_int32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_not { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalOr_float.mlir b/test/PlaidML/OpTest.LogicalOr_float.mlir index f2bf9c7ae..8b5743ecb 100644 --- a/test/PlaidML/OpTest.LogicalOr_float.mlir +++ b/test/PlaidML/OpTest.LogicalOr_float.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: 
--entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_or { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalOr_int32.mlir b/test/PlaidML/OpTest.LogicalOr_int32.mlir index 3f5c56f8f..fe75c1920 100644 --- a/test/PlaidML/OpTest.LogicalOr_int32.mlir +++ b/test/PlaidML/OpTest.LogicalOr_int32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_or { func.func @main() { diff --git a/test/PlaidML/OpTest.LogicalOr_uint64.mlir b/test/PlaidML/OpTest.LogicalOr_uint64.mlir index f86c2fe2c..06f826556 100644 --- a/test/PlaidML/OpTest.LogicalOr_uint64.mlir +++ b/test/PlaidML/OpTest.LogicalOr_uint64.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @logical_or { func.func @main() { diff --git a/test/PlaidML/OpTest.Matmul.mlir b/test/PlaidML/OpTest.Matmul.mlir index e81d850ed..c8b24b109 100644 --- a/test/PlaidML/OpTest.Matmul.mlir +++ b/test/PlaidML/OpTest.Matmul.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @linalg_matmul{ func.func @matmul(%arg0: tensor<5x3xf32>, %arg1: tensor<3x2xf32>) -> (tensor<5x2xf32>) { %cst = arith.constant 0.0 : f32 diff --git a/test/PlaidML/OpTest.Max.mlir b/test/PlaidML/OpTest.Max.mlir index 3d1e997a8..16dc4996a 100644 --- a/test/PlaidML/OpTest.Max.mlir +++ b/test/PlaidML/OpTest.Max.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> module @max { diff --git a/test/PlaidML/OpTest.MaxPool1D.mlir b/test/PlaidML/OpTest.MaxPool1D.mlir index b2a40022d..a698584bb 100644 --- a/test/PlaidML/OpTest.MaxPool1D.mlir +++ b/test/PlaidML/OpTest.MaxPool1D.mlir @@ -10,6 +10,10 @@ // RUN: --runner 
imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0)> module @max_pool_1d { diff --git a/test/PlaidML/OpTest.MnistCnn.mlir b/test/PlaidML/OpTest.MnistCnn.mlir index a667b5942..a54705451 100644 --- a/test/PlaidML/OpTest.MnistCnn.mlir +++ b/test/PlaidML/OpTest.MnistCnn.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d3)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)> diff --git a/test/PlaidML/OpTest.MnistMlp.mlir b/test/PlaidML/OpTest.MnistMlp.mlir index 5e5b19895..846a7d1e7 100644 --- a/test/PlaidML/OpTest.MnistMlp.mlir +++ b/test/PlaidML/OpTest.MnistMlp.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/test/PlaidML/OpTest.Pow.mlir b/test/PlaidML/OpTest.Pow.mlir index 1a4b44af9..9f090518a 100644 --- a/test/PlaidML/OpTest.Pow.mlir +++ b/test/PlaidML/OpTest.Pow.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @pow { func.func @main() { diff --git a/test/PlaidML/OpTest.Quantize.mlir b/test/PlaidML/OpTest.Quantize.mlir index d2191271d..514ccbd4f 100644 --- a/test/PlaidML/OpTest.Quantize.mlir +++ b/test/PlaidML/OpTest.Quantize.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: 
--shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> module @quantize { diff --git a/test/PlaidML/OpTest.Reciprocal.mlir b/test/PlaidML/OpTest.Reciprocal.mlir index b66446c5d..ef6bc8138 100644 --- a/test/PlaidML/OpTest.Reciprocal.mlir +++ b/test/PlaidML/OpTest.Reciprocal.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0) -> ()> #map1 = affine_map<(d0) -> (d0)> module @reciprocal { diff --git a/test/PlaidML/OpTest.Relu.mlir b/test/PlaidML/OpTest.Relu.mlir index 25a91053b..ec4dea6f4 100644 --- a/test/PlaidML/OpTest.Relu.mlir +++ b/test/PlaidML/OpTest.Relu.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @relu { diff --git a/test/PlaidML/OpTest.RepeatElements.mlir b/test/PlaidML/OpTest.RepeatElements.mlir index 8a7102290..7bf329d03 100644 --- a/test/PlaidML/OpTest.RepeatElements.mlir +++ b/test/PlaidML/OpTest.RepeatElements.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d1 floordiv 3, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> module @repeat_elts { diff --git a/test/PlaidML/OpTest.ReshapeFold.mlir b/test/PlaidML/OpTest.ReshapeFold.mlir index 888a5eb23..0b99cf10a 100644 --- a/test/PlaidML/OpTest.ReshapeFold.mlir +++ b/test/PlaidML/OpTest.ReshapeFold.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @reshape_fold { func.func @main() { diff --git a/test/PlaidML/OpTest.ReshapeIntoScalar.mlir b/test/PlaidML/OpTest.ReshapeIntoScalar.mlir index 2ee68cb96..ee14b2343 100644 --- a/test/PlaidML/OpTest.ReshapeIntoScalar.mlir +++ b/test/PlaidML/OpTest.ReshapeIntoScalar.mlir @@ -10,6 +10,10 @@ // RUN: --runner 
imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<() -> ()> module @reshape_into_scalar { func.func @main() { diff --git a/test/PlaidML/OpTest.Select.mlir b/test/PlaidML/OpTest.Select.mlir index 15ae46c60..02bb11b80 100644 --- a/test/PlaidML/OpTest.Select.mlir +++ b/test/PlaidML/OpTest.Select.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @select { diff --git a/test/PlaidML/OpTest.Shape.mlir b/test/PlaidML/OpTest.Shape.mlir index bcc07e478..4288a61c8 100644 --- a/test/PlaidML/OpTest.Shape.mlir +++ b/test/PlaidML/OpTest.Shape.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @shape { func.func @main() { %0= arith.constant dense<[[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]>:tensor<2x3xf32> diff --git a/test/PlaidML/OpTest.SimpleAdd.mlir b/test/PlaidML/OpTest.SimpleAdd.mlir index 2aa9b9213..855d823c2 100644 --- a/test/PlaidML/OpTest.SimpleAdd.mlir +++ b/test/PlaidML/OpTest.SimpleAdd.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @simple_add { func.func @main() { diff --git a/test/PlaidML/OpTest.Sin.mlir b/test/PlaidML/OpTest.Sin.mlir index 74198f5c3..df4b47502 100644 --- a/test/PlaidML/OpTest.Sin.mlir +++ b/test/PlaidML/OpTest.Sin.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @sin { func.func @main() { diff --git 
a/test/PlaidML/OpTest.SinH.mlir b/test/PlaidML/OpTest.SinH.mlir index 4a7225572..a80196627 100644 --- a/test/PlaidML/OpTest.SinH.mlir +++ b/test/PlaidML/OpTest.SinH.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @sinh { func.func @main() { diff --git a/test/PlaidML/OpTest.Softmax.mlir b/test/PlaidML/OpTest.Softmax.mlir index 4081a5a02..b55f1b65b 100644 --- a/test/PlaidML/OpTest.Softmax.mlir +++ b/test/PlaidML/OpTest.Softmax.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0, 0)> module @softmax { diff --git a/test/PlaidML/OpTest.Sum.dynamic.mlir b/test/PlaidML/OpTest.Sum.dynamic.mlir index d21cb0c90..0c490779d 100644 --- a/test/PlaidML/OpTest.Sum.dynamic.mlir +++ b/test/PlaidML/OpTest.Sum.dynamic.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @sum { diff --git a/test/PlaidML/OpTest.Sum.mlir b/test/PlaidML/OpTest.Sum.mlir index c17359ce8..48ecfa8c3 100644 --- a/test/PlaidML/OpTest.Sum.mlir +++ b/test/PlaidML/OpTest.Sum.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @sum { diff --git a/test/PlaidML/OpTest.Tan.mlir b/test/PlaidML/OpTest.Tan.mlir index 12189ac1d..efd76387d 100644 --- a/test/PlaidML/OpTest.Tan.mlir +++ b/test/PlaidML/OpTest.Tan.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: 
--entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @tan { func.func @main() { diff --git a/test/PlaidML/OpTest.Transpose.mlir b/test/PlaidML/OpTest.Transpose.mlir index 7db708c75..4f7b8c8ab 100644 --- a/test/PlaidML/OpTest.Transpose.mlir +++ b/test/PlaidML/OpTest.Transpose.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> module @transpose { diff --git a/test/SPIRV/CppEdsl.Convolution_BF16.mlir b/test/SPIRV/CppEdsl.Convolution_BF16.mlir index a08ed05cd..34c50307a 100644 --- a/test/SPIRV/CppEdsl.Convolution_BF16.mlir +++ b/test/SPIRV/CppEdsl.Convolution_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @convolution attributes {gpu.container_module} { memref.global "private" constant @__constant_3x3x64x64xbf16 : memref<3x3x64x64xbf16> = dense<5.000000e-01> diff --git a/test/SPIRV/IntelVectorExtension/DPAS_Dynamic_Size_BF16.mlir b/test/SPIRV/IntelVectorExtension/DPAS_Dynamic_Size_BF16.mlir index 5bd009831..5216cfdc1 100644 --- a/test/SPIRV/IntelVectorExtension/DPAS_Dynamic_Size_BF16.mlir +++ b/test/SPIRV/IntelVectorExtension/DPAS_Dynamic_Size_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck /// A simple Matrix Multiplication using DPAS instruction diff --git a/test/SPIRV/IntelVectorExtension/DPAS_Static_Size_BF16.mlir b/test/SPIRV/IntelVectorExtension/DPAS_Static_Size_BF16.mlir index a0d15aa93..598573993 100644 --- a/test/SPIRV/IntelVectorExtension/DPAS_Static_Size_BF16.mlir +++ b/test/SPIRV/IntelVectorExtension/DPAS_Static_Size_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck /// A simple Matrix 
Multiplication using DPAS instruction /// A and B are in bf16, while the result C is f32 diff --git a/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir b/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir index a02705a56..6e988b9f5 100644 --- a/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir +++ b/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck /// A simple Matrix Multiplication using DPAS instruction diff --git a/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir index eb27e5744..ca28c8782 100644 --- a/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir +++ b/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir @@ -6,7 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck - +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // A simple test case showing how to use raw_send2 VC intrinsics for doing a load1d module attributes {gpu.container_module} { diff --git a/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir b/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir index f4f9a7627..163ad65cb 100644 --- a/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir +++ b/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module attributes {gpu.container_module} { diff --git a/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir index 9f44d7fcf..77dec0cb5 100644 --- a/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir +++ b/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck /// A simple load2d/store2d example /// This example loads and stores 16x16xf32 elements using raw_send2/store2d diff --git 
a/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir index e06d80ae0..e07c13314 100644 --- a/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir +++ b/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck /// A simple load2d/store2d example /// This example loads and stores 16x16xf32 elements using load2d/raw_sends2 From 1c6b1f25acdbfc618666d5fd56c5010808340605 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Mon, 1 Jul 2024 08:43:11 +0000 Subject: [PATCH 04/10] fix wrapper lifetime --- .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index 9d424dfd3..f57c54edd 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef _WIN32 #define SYCL_RUNTIME_EXPORT __declspec(dllexport) @@ -61,8 +62,7 @@ namespace { constexpr char DeviceMemAllocName[] = "clDeviceMemAllocINTEL"; constexpr char SharedMemAllocName[] = "clSharedMemAllocINTEL"; constexpr char MemBlockingFreeName[] = "clMemBlockingFreeINTEL"; -constexpr char SetKernelArgMemPointerName[] = - "clSetKernelArgMemPointerINTEL"; +constexpr char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; static constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL"; void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) { @@ -367,16 +367,28 @@ static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, globalSize, localSize, 0, NULL, NULL)); } +static GPUCLQUEUE *getDefaultQueue() { + static GPUCLQUEUE defaultq(static_cast(nullptr), nullptr, + nullptr); + return &defaultq; +} + // Wrappers extern "C" SYCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device, void *context) { + // todo: this is a workaround of issue of gpux generating multiple streams + if (!device && !context) { + return getDefaultQueue(); + } return new GPUCLQUEUE(reinterpret_cast(device), reinterpret_cast(context), nullptr); } extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) { - delete queue; + // todo: this is a workaround of issue of gpux generating multiple streams + // should uncomment the below line to release the queue + // delete queue; } extern "C" SYCL_RUNTIME_EXPORT void * From b991fff3783cafd0b3c49ee72fc39e31951de46c Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Mon, 1 Jul 2024 09:08:33 +0000 Subject: [PATCH 05/10] add tests --- .../Dialect/XeTile/block_broadcast_dim_0_fp32.mlir | 4 ++++ .../Dialect/XeTile/block_broadcast_dim_1_fp32.mlir | 4 ++++ test/Integration/Dialect/XeTile/block_reduce_dim_0_fp32.mlir | 4 ++++ test/Integration/Dialect/XeTile/block_reduce_dim_1_fp32.mlir | 4 ++++ test/Integration/Dialect/XeTile/block_softmax_dim_0_fp32.mlir | 4 ++++ 
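Note on the wrapper-lifetime change in the patch above: gpuCreateStream now returns a function-local static GPUCLQUEUE whenever both device and context are null, and gpuStreamDestroy no longer deletes the queue. The C++ sketch below is only a minimal, self-contained illustration of that ownership pattern; Queue, getDefaultQueueSketch, createStreamSketch, and destroyStreamSketch are illustrative stand-ins, not the actual IMEX wrappers.

#include <cstdio>

struct Queue {
  Queue() { std::puts("default queue created on first use"); }
  ~Queue() { std::puts("default queue destroyed at process exit"); }
};

static Queue *getDefaultQueueSketch() {
  // Function-local static: constructed on first call, destroyed automatically
  // at process exit, so callers must never delete the returned pointer.
  static Queue defaultQueue;
  return &defaultQueue;
}

static Queue *createStreamSketch(void *device, void *context) {
  // Mirrors the workaround above: when the caller passes no device/context,
  // hand back the shared default queue instead of allocating a new one.
  if (!device && !context)
    return getDefaultQueueSketch();
  return new Queue();
}

static void destroyStreamSketch(Queue *q) {
  // Deliberately a no-op: deleting the shared default queue here would lead
  // to a double free when its static destructor runs at process teardown.
  // (Caller-created queues leak too, which is why the patch flags this as a
  // temporary workaround.)
  (void)q;
}

int main() {
  Queue *a = createStreamSketch(nullptr, nullptr);
  destroyStreamSketch(a); // no-op by design
  Queue *b = createStreamSketch(nullptr, nullptr);
  std::printf("same default queue reused: %s\n", a == b ? "yes" : "no");
  return 0;
}
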
test/Integration/Dialect/XeTile/block_softmax_dim_1_fp32.mlir | 4 ++++ .../Dialect/XeTile/sg_gemm_1kx1kx1k_bf16_bf16_f32.mlir | 4 ++++ .../XeTile/sg_gemm_1kx1kx1k_f16_f16_f16_blk_16x32x32.mlir | 4 ++++ .../Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32.mlir | 4 ++++ .../XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_a.mlir | 4 ++++ .../XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_b.mlir | 4 ++++ .../XeTile/sg_gemm_1kx1kx1k_f32_f32_f32_with_truncf_a_b.mlir | 4 ++++ .../XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir | 4 ++++ ...gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_postop.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_a.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_b.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_preop_a.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_preop_a_b.mlir | 4 ++++ test/Integration/Dialect/XeTile/sg_gemm_preop_b.mlir | 4 ++++ .../Dialect/XeTile/transpose_1kx1kx1k_f16_f16_f16.mlir | 4 ++++ test/SPIRV/IntelVectorExtension/GEMM_4kx4kx4k_BF16.mlir | 4 ++++ ...rix_Physical_64_addressing_matrixUse_Param_level_zero.mlir | 4 ++++ test/SPIRV/OpTest.ArgMax_BF16.mlir | 4 ++++ test/SPIRV/OpTest.Argmax_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.BroadcastNonNumpy_BF16.mlir | 4 ++++ test/SPIRV/OpTest.BroadcastNonNumpy_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.Conv2D_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.EltwiseAdd_BF16.mlir | 4 ++++ test/SPIRV/OpTest.EltwiseAdd_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.ExplicitPadding_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.GEMM_BF16.mlir | 4 ++++ test/SPIRV/OpTest.GEMM_BF16_ACC_F32.mlir | 4 ++++ test/SPIRV/OpTest.GEMM_F16_ACC_F32.mlir | 4 ++++ test/SPIRV/OpTest.MaxPool1D_INT64.mlir | 4 ++++ test/SPIRV/OpTest.Quantize_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.Relu_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.SlmDynamic.mlir | 4 ++++ test/SPIRV/OpTest.Softmax_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.Sum_FLOAT32.mlir | 4 ++++ test/SPIRV/OpTest.Transpose_FLOAT32.mlir | 4 ++++ 41 files changed, 164 insertions(+) diff --git a/test/Integration/Dialect/XeTile/block_broadcast_dim_0_fp32.mlir b/test/Integration/Dialect/XeTile/block_broadcast_dim_0_fp32.mlir index 5ee8c44d5..a23998c33 100644 --- a/test/Integration/Dialect/XeTile/block_broadcast_dim_0_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_broadcast_dim_0_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { func.func @broadcast_test() -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/block_broadcast_dim_1_fp32.mlir b/test/Integration/Dialect/XeTile/block_broadcast_dim_1_fp32.mlir index c7491fc0d..ccd69ae72 100644 --- a/test/Integration/Dialect/XeTile/block_broadcast_dim_1_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_broadcast_dim_1_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: 
--shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { func.func @broadcast_test() -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/block_reduce_dim_0_fp32.mlir b/test/Integration/Dialect/XeTile/block_reduce_dim_0_fp32.mlir index 0d5076100..d93008947 100644 --- a/test/Integration/Dialect/XeTile/block_reduce_dim_0_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_reduce_dim_0_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { func.func @reduce_test(%a: memref<16x1024xf32>) -> memref<1x1024xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/block_reduce_dim_1_fp32.mlir b/test/Integration/Dialect/XeTile/block_reduce_dim_1_fp32.mlir index 3183930d8..72b437a46 100644 --- a/test/Integration/Dialect/XeTile/block_reduce_dim_1_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_reduce_dim_1_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { func.func @reduce_test(%a: memref<1024x32xf32>) -> memref<1x1024xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/block_softmax_dim_0_fp32.mlir b/test/Integration/Dialect/XeTile/block_softmax_dim_0_fp32.mlir index 6613172d3..f807c31ff 100644 --- a/test/Integration/Dialect/XeTile/block_softmax_dim_0_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_softmax_dim_0_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { func.func @block_softmax_test(%a: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes 
{llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/block_softmax_dim_1_fp32.mlir b/test/Integration/Dialect/XeTile/block_softmax_dim_1_fp32.mlir index 317d42576..2667eac86 100644 --- a/test/Integration/Dialect/XeTile/block_softmax_dim_1_fp32.mlir +++ b/test/Integration/Dialect/XeTile/block_softmax_dim_1_fp32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @block_softmax attributes {gpu.container_module} { func.func @block_softmax_test(%a: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_bf16_bf16_f32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_bf16_bf16_f32.mlir index 1c26bab1f..a182b126c 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_bf16_bf16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_bf16_bf16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f16_blk_16x32x32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f16_blk_16x32x32.mlir index 5ef63af49..af115d8a9 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f16_blk_16x32x32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f16_blk_16x32x32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32.mlir index a3bd39dd1..0f03f17e4 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: 
%python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_a.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_a.mlir index 6622e4223..cef16d1d6 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_a.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_a.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_b.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_b.mlir index 14a52dbb1..e796e40fd 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_b.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f16_f16_f32_transpose_b.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f32_f32_f32_with_truncf_a_b.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f32_f32_f32_with_truncf_a_b.mlir index 21d9e4664..bfb264cc4 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f32_f32_f32_with_truncf_a_b.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_f32_f32_f32_with_truncf_a_b.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir 
b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir index eb0406c55..896dc826d 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir index 0b20ac34a..1b0ad482a 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_postop.mlir b/test/Integration/Dialect/XeTile/sg_gemm_postop.mlir index 36d794c36..d5c5e0ed6 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_postop.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_postop.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_a.mlir b/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_a.mlir index 965c2e65d..aa50d10e9 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_a.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_a.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner 
--requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>, %bcast: memref<1x1024xf16>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_b.mlir b/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_b.mlir index ba730b0b4..0fb02d40f 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_b.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_pre_broadcast_b.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>, %bcast: memref<1x1024xf16>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeTile/sg_gemm_preop_a.mlir b/test/Integration/Dialect/XeTile/sg_gemm_preop_a.mlir index a83e8dcfb..cb1da255b 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_preop_a.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_preop_a.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeTile/sg_gemm_preop_a_b.mlir b/test/Integration/Dialect/XeTile/sg_gemm_preop_a_b.mlir index 79fb4efde..6ff19624a 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_preop_a_b.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_preop_a_b.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git 
a/test/Integration/Dialect/XeTile/sg_gemm_preop_b.mlir b/test/Integration/Dialect/XeTile/sg_gemm_preop_b.mlir index 011c8ce9b..78bb3e1e1 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_preop_b.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_preop_b.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeTile/transpose_1kx1kx1k_f16_f16_f16.mlir b/test/Integration/Dialect/XeTile/transpose_1kx1kx1k_f16_f16_f16.mlir index 5901acefb..3e4697675 100644 --- a/test/Integration/Dialect/XeTile/transpose_1kx1kx1k_f16_f16_f16.mlir +++ b/test/Integration/Dialect/XeTile/transpose_1kx1kx1k_f16_f16_f16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/SPIRV/IntelVectorExtension/GEMM_4kx4kx4k_BF16.mlir b/test/SPIRV/IntelVectorExtension/GEMM_4kx4kx4k_BF16.mlir index 5582f6ab4..2523dd4b6 100644 --- a/test/SPIRV/IntelVectorExtension/GEMM_4kx4kx4k_BF16.mlir +++ b/test/SPIRV/IntelVectorExtension/GEMM_4kx4kx4k_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module attributes {gpu.container_module} { diff --git a/test/SPIRV/JointMatrix/gemm_using_joint_matrix_Physical_64_addressing_matrixUse_Param_level_zero.mlir b/test/SPIRV/JointMatrix/gemm_using_joint_matrix_Physical_64_addressing_matrixUse_Param_level_zero.mlir index 288db45e1..9bfdfc848 100644 --- a/test/SPIRV/JointMatrix/gemm_using_joint_matrix_Physical_64_addressing_matrixUse_Param_level_zero.mlir +++ b/test/SPIRV/JointMatrix/gemm_using_joint_matrix_Physical_64_addressing_matrixUse_Param_level_zero.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // CHECK-COUNT-4194304: 4970.23 module @gemm_using_jointmatrix_module attributes {gpu.container_module} { memref.global "private" constant @__constant_A_2048x2048xbf16 : memref<2048x2048xbf16> = dense<1.100000e+00> diff --git a/test/SPIRV/OpTest.ArgMax_BF16.mlir b/test/SPIRV/OpTest.ArgMax_BF16.mlir index 3e634bcfd..c22985981 100644 --- a/test/SPIRV/OpTest.ArgMax_BF16.mlir +++ b/test/SPIRV/OpTest.ArgMax_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @argmax attributes {gpu.container_module} { memref.global "private" constant @__constant_1x4x4x3xbf16 : memref<1x4x4x3xbf16> = dense<[[[[9.000000e+00, 8.000000e+00, 0.000000e+00], [1.000000e+00, 5.000000e+00, 0.000000e+00], [1.000000e+00, 1.000000e+00, 7.000000e+00], [8.000000e+00, 2.000000e+00, 2.000000e+00]], [[8.000000e+00, 0.000000e+00, 4.000000e+00], [7.000000e+00, 5.000000e+00, 5.000000e+00], [8.000000e+00, 2.000000e+00, 0.000000e+00], [0.000000e+00, 9.000000e+00, 5.000000e+00]], [[4.000000e+00, 7.000000e+00, 2.000000e+00], [4.000000e+00, 5.000000e+00, 1.000000e+00], [3.000000e+00, 3.000000e+00, 6.000000e+00], [8.000000e+00, 0.000000e+00, 1.000000e+00]], [[2.000000e+00, 8.000000e+00, 4.000000e+00], [0.000000e+00, 5.000000e+00, 5.000000e+00], [6.000000e+00, 1.000000e+00, 1.000000e+00], [3.000000e+00, 3.000000e+00, 1.000000e+00]]]]> diff --git a/test/SPIRV/OpTest.Argmax_FLOAT32.mlir b/test/SPIRV/OpTest.Argmax_FLOAT32.mlir index 5555853d1..6af9f9a8d 100644 --- a/test/SPIRV/OpTest.Argmax_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Argmax_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @argmax attributes {gpu.container_module} { memref.global "private" constant @__constant_1x4x4x3xf32 : memref<1x4x4x3xf32> = dense<[[[[9.000000e+00, 8.000000e+00, 0.000000e+00], [1.000000e+00, 5.000000e+00, 0.000000e+00], [1.000000e+00, 1.000000e+00, 7.000000e+00], [8.000000e+00, 2.000000e+00, 2.000000e+00]], [[8.000000e+00, 0.000000e+00, 4.000000e+00], [7.000000e+00, 5.000000e+00, 5.000000e+00], [8.000000e+00, 2.000000e+00, 0.000000e+00], [0.000000e+00, 9.000000e+00, 5.000000e+00]], [[4.000000e+00, 7.000000e+00, 2.000000e+00], [4.000000e+00, 5.000000e+00, 1.000000e+00], [3.000000e+00, 3.000000e+00, 6.000000e+00], [8.000000e+00, 0.000000e+00, 1.000000e+00]], [[2.000000e+00, 8.000000e+00, 4.000000e+00], [0.000000e+00, 5.000000e+00, 5.000000e+00], [6.000000e+00, 1.000000e+00, 1.000000e+00], [3.000000e+00, 3.000000e+00, 
1.000000e+00]]]]> diff --git a/test/SPIRV/OpTest.BroadcastNonNumpy_BF16.mlir b/test/SPIRV/OpTest.BroadcastNonNumpy_BF16.mlir index ac7c92939..c31119d2a 100644 --- a/test/SPIRV/OpTest.BroadcastNonNumpy_BF16.mlir +++ b/test/SPIRV/OpTest.BroadcastNonNumpy_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @broadcast_non_numpy attributes {gpu.container_module} { memref.global "private" constant @__constant_3xbf16 : memref<3xbf16> = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> diff --git a/test/SPIRV/OpTest.BroadcastNonNumpy_FLOAT32.mlir b/test/SPIRV/OpTest.BroadcastNonNumpy_FLOAT32.mlir index 8e8490994..8df9effff 100644 --- a/test/SPIRV/OpTest.BroadcastNonNumpy_FLOAT32.mlir +++ b/test/SPIRV/OpTest.BroadcastNonNumpy_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @broadcast_non_numpy attributes {gpu.container_module} { memref.global "private" constant @__constant_3xf32 : memref<3xf32> = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> diff --git a/test/SPIRV/OpTest.Conv2D_FLOAT32.mlir b/test/SPIRV/OpTest.Conv2D_FLOAT32.mlir index 2879b769e..618ec5d84 100644 --- a/test/SPIRV/OpTest.Conv2D_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Conv2D_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @complex_conv_2d attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.EltwiseAdd_BF16.mlir b/test/SPIRV/OpTest.EltwiseAdd_BF16.mlir index 035c1d1b8..becb4eefc 100644 --- a/test/SPIRV/OpTest.EltwiseAdd_BF16.mlir +++ b/test/SPIRV/OpTest.EltwiseAdd_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @eltwise_add attributes {gpu.container_module} { memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<[[1.0, 2.0, 3.0, 4.0, 5.0, 
6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0], diff --git a/test/SPIRV/OpTest.EltwiseAdd_FLOAT32.mlir b/test/SPIRV/OpTest.EltwiseAdd_FLOAT32.mlir index dd2f97998..051acdd40 100644 --- a/test/SPIRV/OpTest.EltwiseAdd_FLOAT32.mlir +++ b/test/SPIRV/OpTest.EltwiseAdd_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @eltwise_add attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.ExplicitPadding_FLOAT32.mlir b/test/SPIRV/OpTest.ExplicitPadding_FLOAT32.mlir index 3f01dc794..5d44cde8a 100644 --- a/test/SPIRV/OpTest.ExplicitPadding_FLOAT32.mlir +++ b/test/SPIRV/OpTest.ExplicitPadding_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @explicit_padding attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.GEMM_BF16.mlir b/test/SPIRV/OpTest.GEMM_BF16.mlir index 54eaf982a..884f796c3 100644 --- a/test/SPIRV/OpTest.GEMM_BF16.mlir +++ b/test/SPIRV/OpTest.GEMM_BF16.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_3x3xbf16_result : memref<3x3xbf16> = dense<1.000000e+00> diff --git a/test/SPIRV/OpTest.GEMM_BF16_ACC_F32.mlir b/test/SPIRV/OpTest.GEMM_BF16_ACC_F32.mlir index 1347ff0ea..3eae1b633 100644 --- a/test/SPIRV/OpTest.GEMM_BF16_ACC_F32.mlir +++ b/test/SPIRV/OpTest.GEMM_BF16_ACC_F32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_3x3xf32 : memref<3x3xf32> = dense<1.000000e+00> diff --git 
a/test/SPIRV/OpTest.GEMM_F16_ACC_F32.mlir b/test/SPIRV/OpTest.GEMM_F16_ACC_F32.mlir index 4d9675596..bf32c16f8 100644 --- a/test/SPIRV/OpTest.GEMM_F16_ACC_F32.mlir +++ b/test/SPIRV/OpTest.GEMM_F16_ACC_F32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.MaxPool1D_INT64.mlir b/test/SPIRV/OpTest.MaxPool1D_INT64.mlir index c84cf00ea..9303564a5 100644 --- a/test/SPIRV/OpTest.MaxPool1D_INT64.mlir +++ b/test/SPIRV/OpTest.MaxPool1D_INT64.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @max_pool_1d attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.Quantize_FLOAT32.mlir b/test/SPIRV/OpTest.Quantize_FLOAT32.mlir index 1dfb018bc..b1db7a019 100644 --- a/test/SPIRV/OpTest.Quantize_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Quantize_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @quantize attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.Relu_FLOAT32.mlir b/test/SPIRV/OpTest.Relu_FLOAT32.mlir index 942f67dc0..8cb3117ee 100644 --- a/test/SPIRV/OpTest.Relu_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Relu_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @relu attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.SlmDynamic.mlir b/test/SPIRV/OpTest.SlmDynamic.mlir index a96336d3e..e148cfd5b 100644 --- a/test/SPIRV/OpTest.SlmDynamic.mlir +++ b/test/SPIRV/OpTest.SlmDynamic.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s 
--pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @slm attributes {gpu.container_module} { memref.global "private" constant @__constant_4x128xf32 : memref<4x128xf32> = dense<[ diff --git a/test/SPIRV/OpTest.Softmax_FLOAT32.mlir b/test/SPIRV/OpTest.Softmax_FLOAT32.mlir index bf8a8133a..11285a4ba 100644 --- a/test/SPIRV/OpTest.Softmax_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Softmax_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @softmax attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.Sum_FLOAT32.mlir b/test/SPIRV/OpTest.Sum_FLOAT32.mlir index 8b11d4e89..0f57e77eb 100644 --- a/test/SPIRV/OpTest.Sum_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Sum_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @sum attributes {gpu.container_module} { diff --git a/test/SPIRV/OpTest.Transpose_FLOAT32.mlir b/test/SPIRV/OpTest.Transpose_FLOAT32.mlir index 73ef05c1e..3883487df 100644 --- a/test/SPIRV/OpTest.Transpose_FLOAT32.mlir +++ b/test/SPIRV/OpTest.Transpose_FLOAT32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @transpose attributes {gpu.container_module} { From f36d4a5f374dc334ed7ea516fff070e1c2df2f82 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 2 Jul 2024 02:52:24 +0000 Subject: [PATCH 06/10] make all UTs happy: fix group size --- lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt | 1 - lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 2 +- .../XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir | 2 +- ...g_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt index 053c4f916..be672e48c 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt +++ b/lib/ExecutionEngine/OPENCLRUNTIME/CMakeLists.txt @@ -38,4 +38,3 @@ target_include_directories(opencl-runtime PRIVATE message(STATUS "OpenCL Libraries: ${OpenCL_LIBRARIES}") 
target_link_libraries(opencl-runtime PRIVATE ${OpenCL_LIBRARIES}) - diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index f57c54edd..0983a8547 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -361,7 +361,7 @@ static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, if (sharedMemBytes) { CL_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr)); } - size_t globalSize[3] = {gridX, gridY, gridZ}; + size_t globalSize[3] = {gridX * blockX, gridY * blockY, gridZ * blockZ}; size_t localSize[3] = {blockX, blockY, blockZ}; CL_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL, globalSize, localSize, 0, NULL, NULL)); diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir index 896dc826d..e238b8c1f 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_preop_postop_bf16_bf16_f32.mlir @@ -9,7 +9,7 @@ // RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,opencl_runtime --filecheck +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation diff --git a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir index 1b0ad482a..ca0e57eff 100644 --- a/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir +++ b/test/Integration/Dialect/XeTile/sg_gemm_1kx1kx1k_transposed_b_preop_postop_bf16_bf16_f32.mlir @@ -9,7 +9,7 @@ // RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xetile-to-func-vc.pp \ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,opencl_runtime --filecheck +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTES : // This example assumes one subgroup per one workgroup and the kernel specifies the computation From 4d15cab8f2ee5ef1710faa8c7200fbf351473ebe Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 2 Jul 2024 03:27:13 +0000 Subject: [PATCH 07/10] more tests --- .../Dialect/Func/gemm-with-func-and-intel-intrinsic.mlir | 4 ++++ .../Dialect/Func/load2d_dpas_store2d_with_intrinsic.mlir | 4 ++++ test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir | 4 ++++ .../XeGPU/gemm_4kx4kx4k_dpas_sized_loads_f16_f16_f32.mlir | 4 ++++ .../Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir | 4 ++++ test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir | 4 ++++ test/Jax/janet/jit_prim_fun.50_linalg.mlir | 4 ++++ test/Jax/jax_qmc/jit__linspace.39_linalg.mlir | 4 ++++ test/Jax/jax_qmc/jit__mean.46_linalg.mlir | 4 ++++ test/Jax/jax_qmc/jit__mean.51_linalg.mlir | 4 ++++ 
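The group-size fix in the hunk above rests on an OpenCL convention worth spelling out: clEnqueueNDRangeKernel takes global_work_size as the total number of work-items per dimension, not the number of work-groups, so a CUDA/Level-Zero style grid count must be multiplied by the work-group size before enqueueing. The sketch below illustrates that convention under the same assumption; enqueueGridLaunch is an illustrative helper, not part of the runtime wrappers.

#include <CL/cl.h>
#include <cstddef>

// Translate a grid/block launch (count of work-groups plus work-group size)
// into OpenCL's global/local work sizes. global_work_size must be the total
// work-item count per dimension, hence grid * block.
static cl_int enqueueGridLaunch(cl_command_queue queue, cl_kernel kernel,
                                size_t gridX, size_t gridY, size_t gridZ,
                                size_t blockX, size_t blockY, size_t blockZ) {
  const size_t globalSize[3] = {gridX * blockX, gridY * blockY, gridZ * blockZ};
  const size_t localSize[3] = {blockX, blockY, blockZ};
  return clEnqueueNDRangeKernel(queue, kernel, /*work_dim=*/3,
                                /*global_work_offset=*/nullptr, globalSize,
                                localSize, /*num_events_in_wait_list=*/0,
                                /*event_wait_list=*/nullptr, /*event=*/nullptr);
}
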
test/Jax/jax_qmc/jit_pionless_2b_lo.41_linalg.mlir | 4 ++++ test/Jax/jax_qmc/jit_prim_fun.13_linalg.mlir | 4 ++++ test/Jax/jax_qmc/jit_v_em.42_linalg.mlir | 4 ++++ test/Jax/qoc/jit_conjugate.307_linalg.mlir | 4 ++++ test/Jax/qoc/jit_prim_fun.335_linalg.mlir | 4 ++++ test/Jax/qoc/jit_real.364_linalg.mlir | 4 ++++ test/Jax/qoc/jit_true_divide.316_linalg.mlir | 4 ++++ .../Resnet-50/resnet-50-linalg-without-tensor-pad.mlir | 4 ++++ test/Models/Resnet-50/resnet-50-linalg.mlir | 4 ++++ test/PlaidML/CppEdsl.Add.mlir | 4 ++++ test/PlaidML/CppEdsl.Atan.mlir | 6 +++++- test/PlaidML/OpTest.GEMM_FLOAT32.mlir | 4 ++++ test/PlaidML/OpTest.HigherPrecisioConstants.mlir | 4 ++++ test/PlaidML/OpTest.UniqueNames.mlir | 6 +++++- test/SPIRV/OpTest.spirv.CL.printf.mlir | 4 ++++ test/SPIRV/relu.slm.static.8x32.mlir | 4 ++++ 26 files changed, 106 insertions(+), 2 deletions(-) diff --git a/test/Integration/Dialect/Func/gemm-with-func-and-intel-intrinsic.mlir b/test/Integration/Dialect/Func/gemm-with-func-and-intel-intrinsic.mlir index cd2f0e2dd..fd6771189 100644 --- a/test/Integration/Dialect/Func/gemm-with-func-and-intel-intrinsic.mlir +++ b/test/Integration/Dialect/Func/gemm-with-func-and-intel-intrinsic.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/func-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { diff --git a/test/Integration/Dialect/Func/load2d_dpas_store2d_with_intrinsic.mlir b/test/Integration/Dialect/Func/load2d_dpas_store2d_with_intrinsic.mlir index 6c79f0a48..7ab954a5a 100644 --- a/test/Integration/Dialect/Func/load2d_dpas_store2d_with_intrinsic.mlir +++ b/test/Integration/Dialect/Func/load2d_dpas_store2d_with_intrinsic.mlir @@ -7,6 +7,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/func-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_8x16xf16 : memref<8x16xf16> = dense<1.000000e+00> diff --git a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir index b2bb6829a..8e4b21a62 100644 --- a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: 
--shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<8x32xf16>, %B: memref<16x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_dpas_sized_loads_f16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_dpas_sized_loads_f16_f16_f32.mlir index e0ee10a29..bcc2e5307 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_dpas_sized_loads_f16_f16_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_dpas_sized_loads_f16_f16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: IMEX_ENABLE_LARGE_REG_FILE=1 %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module} { func.func @test(%A: memref<4096x4096xf16>, %B: memref<4096x4096xf16>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir index 53f036197..43051899d 100644 --- a/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm-joint-matrix.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck // NOTE: This test case provides an end-to-end example of XeGPU SIMT mode ops to SPIR-V JointMatrix ops lowering diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir index fe13bcc47..4d45db375 100644 --- a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir +++ b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir @@ -6,6 +6,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @gemm attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> diff --git a/test/Jax/janet/jit_prim_fun.50_linalg.mlir b/test/Jax/janet/jit_prim_fun.50_linalg.mlir index 05514c173..8bbbc45aa 100644 --- 
a/test/Jax/janet/jit_prim_fun.50_linalg.mlir +++ b/test/Jax/janet/jit_prim_fun.50_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @jit_prim_fun.50 { diff --git a/test/Jax/jax_qmc/jit__linspace.39_linalg.mlir b/test/Jax/jax_qmc/jit__linspace.39_linalg.mlir index 2c6cc8273..6eea5c2b2 100644 --- a/test/Jax/jax_qmc/jit__linspace.39_linalg.mlir +++ b/test/Jax/jax_qmc/jit__linspace.39_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<() -> ()> #map1 = affine_map<(d0) -> ()> #map2 = affine_map<(d0) -> (d0)> diff --git a/test/Jax/jax_qmc/jit__mean.46_linalg.mlir b/test/Jax/jax_qmc/jit__mean.46_linalg.mlir index 7c22f5931..3c0e2d753 100644 --- a/test/Jax/jax_qmc/jit__mean.46_linalg.mlir +++ b/test/Jax/jax_qmc/jit__mean.46_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1) -> (d0, d1)> diff --git a/test/Jax/jax_qmc/jit__mean.51_linalg.mlir b/test/Jax/jax_qmc/jit__mean.51_linalg.mlir index 77b814fa2..53d12620a 100644 --- a/test/Jax/jax_qmc/jit__mean.51_linalg.mlir +++ b/test/Jax/jax_qmc/jit__mean.51_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> #map2 = affine_map<() -> ()> diff --git a/test/Jax/jax_qmc/jit_pionless_2b_lo.41_linalg.mlir b/test/Jax/jax_qmc/jit_pionless_2b_lo.41_linalg.mlir index a43b567a1..219079953 100644 --- a/test/Jax/jax_qmc/jit_pionless_2b_lo.41_linalg.mlir +++ b/test/Jax/jax_qmc/jit_pionless_2b_lo.41_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: 
--entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<() -> ()> #map1 = affine_map<(d0) -> (d0)> #map2 = affine_map<(d0) -> (0)> diff --git a/test/Jax/jax_qmc/jit_prim_fun.13_linalg.mlir b/test/Jax/jax_qmc/jit_prim_fun.13_linalg.mlir index 94f1a3ea7..7f53ad096 100644 --- a/test/Jax/jax_qmc/jit_prim_fun.13_linalg.mlir +++ b/test/Jax/jax_qmc/jit_prim_fun.13_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0) -> (d0)> module @jit_prim_fun.13 { diff --git a/test/Jax/jax_qmc/jit_v_em.42_linalg.mlir b/test/Jax/jax_qmc/jit_v_em.42_linalg.mlir index 3e430c779..e6d6c8b87 100644 --- a/test/Jax/jax_qmc/jit_v_em.42_linalg.mlir +++ b/test/Jax/jax_qmc/jit_v_em.42_linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<() -> ()> module @jit_v_em.42 { diff --git a/test/Jax/qoc/jit_conjugate.307_linalg.mlir b/test/Jax/qoc/jit_conjugate.307_linalg.mlir index 2d7b6a4c1..4c3f9cc78 100644 --- a/test/Jax/qoc/jit_conjugate.307_linalg.mlir +++ b/test/Jax/qoc/jit_conjugate.307_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @jit_conjugate.307 { func.func private @callee(%arg0: tensor<1x2xcomplex>) -> tensor<1x2xcomplex> { diff --git a/test/Jax/qoc/jit_prim_fun.335_linalg.mlir b/test/Jax/qoc/jit_prim_fun.335_linalg.mlir index 693714511..f3bb44dd6 100644 --- a/test/Jax/qoc/jit_prim_fun.335_linalg.mlir +++ b/test/Jax/qoc/jit_prim_fun.335_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner 
imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0) -> (d0)> module @jit_prim_fun.335 { diff --git a/test/Jax/qoc/jit_real.364_linalg.mlir b/test/Jax/qoc/jit_real.364_linalg.mlir index 07753a7af..9849504c2 100644 --- a/test/Jax/qoc/jit_real.364_linalg.mlir +++ b/test/Jax/qoc/jit_real.364_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0) -> (d0)> module @jit_real.364 { diff --git a/test/Jax/qoc/jit_true_divide.316_linalg.mlir b/test/Jax/qoc/jit_true_divide.316_linalg.mlir index 22bac4239..7ad288d68 100644 --- a/test/Jax/qoc/jit_true_divide.316_linalg.mlir +++ b/test/Jax/qoc/jit_true_divide.316_linalg.mlir @@ -10,6 +10,10 @@ // RUN-GPU: --runner imex-cpu-runner -e main \ // RUN-GPU: --entry-point-result=void \ // RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN-GPU: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN-GPU: --runner imex-cpu-runner -e main \ +// RUN-GPU: --entry-point-result=void \ +// RUN-GPU: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<() -> ()> module @jit_true_divide.316 { func.func private @callee(%arg0: tensor>, %arg1: tensor) -> tensor> { diff --git a/test/Models/Resnet-50/resnet-50-linalg-without-tensor-pad.mlir b/test/Models/Resnet-50/resnet-50-linalg-without-tensor-pad.mlir index 97d16de0c..d41ca5cc5 100644 --- a/test/Models/Resnet-50/resnet-50-linalg-without-tensor-pad.mlir +++ b/test/Models/Resnet-50/resnet-50-linalg-without-tensor-pad.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> diff --git a/test/Models/Resnet-50/resnet-50-linalg.mlir b/test/Models/Resnet-50/resnet-50-linalg.mlir index e89ce9f5a..b0a263120 100644 --- a/test/Models/Resnet-50/resnet-50-linalg.mlir +++ b/test/Models/Resnet-50/resnet-50-linalg.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, 
d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> diff --git a/test/PlaidML/CppEdsl.Add.mlir b/test/PlaidML/CppEdsl.Add.mlir index 3da63bb55..2089b34e9 100644 --- a/test/PlaidML/CppEdsl.Add.mlir +++ b/test/PlaidML/CppEdsl.Add.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @add { func.func @main() { diff --git a/test/PlaidML/CppEdsl.Atan.mlir b/test/PlaidML/CppEdsl.Atan.mlir index fe29fb7ae..acee0198d 100644 --- a/test/PlaidML/CppEdsl.Atan.mlir +++ b/test/PlaidML/CppEdsl.Atan.mlir @@ -9,7 +9,11 @@ // RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ -// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0, d1) -> (d0, d1)> module @atan { func.func @test(%arg0: tensor<3x3xf32>) -> tensor<3x3xf32> { diff --git a/test/PlaidML/OpTest.GEMM_FLOAT32.mlir b/test/PlaidML/OpTest.GEMM_FLOAT32.mlir index bfebd5c21..c6842bb67 100644 --- a/test/PlaidML/OpTest.GEMM_FLOAT32.mlir +++ b/test/PlaidML/OpTest.GEMM_FLOAT32.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/test/PlaidML/OpTest.HigherPrecisioConstants.mlir b/test/PlaidML/OpTest.HigherPrecisioConstants.mlir index 3eff7140a..3be0d85b9 100644 --- a/test/PlaidML/OpTest.HigherPrecisioConstants.mlir +++ b/test/PlaidML/OpTest.HigherPrecisioConstants.mlir @@ -10,6 +10,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime,igpu-fp64 %igpu_fp64 -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map0 = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> ()> module @higher_precision_constants { 
diff --git a/test/PlaidML/OpTest.UniqueNames.mlir b/test/PlaidML/OpTest.UniqueNames.mlir index 82bdce5b8..9433f86a1 100644 --- a/test/PlaidML/OpTest.UniqueNames.mlir +++ b/test/PlaidML/OpTest.UniqueNames.mlir @@ -9,7 +9,11 @@ // RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ -// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck #map = affine_map<(d0) -> (d0)> module @unique_names { func.func @main() { diff --git a/test/SPIRV/OpTest.spirv.CL.printf.mlir b/test/SPIRV/OpTest.spirv.CL.printf.mlir index a7be187f6..9464792d3 100644 --- a/test/SPIRV/OpTest.spirv.CL.printf.mlir +++ b/test/SPIRV/OpTest.spirv.CL.printf.mlir @@ -2,6 +2,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @print_simple attributes {gpu.container_module} { diff --git a/test/SPIRV/relu.slm.static.8x32.mlir b/test/SPIRV/relu.slm.static.8x32.mlir index e73eda5fd..a1e07d3bb 100644 --- a/test/SPIRV/relu.slm.static.8x32.mlir +++ b/test/SPIRV/relu.slm.static.8x32.mlir @@ -2,6 +2,10 @@ // RUN: --runner imex-cpu-runner -e main \ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=opencl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime --filecheck module @test attributes {gpu.container_module} { memref.global "private" constant @__constant_8x32xf32 : memref<8x32xf32> = dense<[ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], From 631e6bca50cd5e94e1271d3e5267de1b0dae5113 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 2 Jul 2024 03:46:14 +0000 Subject: [PATCH 08/10] update doc --- README.md | 15 ++++++++++++++- .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bbf96eabf..fedd4221d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Intel® Extension for MLIR (IMEX) is a collection of MLIR dialects and passes from Intel for supporting MLIR lowering to Intel silicon (CPU, GPU, …). Goal of this project is to support development of MLIR enhancements for upstream contribution, and to provide a sandbox for validation independent of front end frameworks. 
Current project scope includes: * Dialects and passes needed to lower and execute MLIR entry dialect (linalg, CFG, and etc) on Intel GPU. -* Wrapper libraries to inteface with level zero runtime and sycl runtime supporting Intel GPU. +* Wrapper libraries to interface with level zero runtime, sycl runtime and OpenCL runtime supporting Intel GPU. * Other experimental dialects: NDArray, Dist ## Requirements for building and development @@ -44,6 +44,13 @@ cmake --build build --target install ``` * Binary package for system-wide install: https://github.com/oneapi-src/level-zero/releases +#### Getting OpenCL headers and libraries (For OpenCL runtime) + * Install using OS-provided package (Ubuntu 22.04) +```sh +sudo apt install -y intel-opencl-icd opencl-c-headers +``` + * Or, download and install the package from: https://github.com/intel/compute-runtime/releases + ### Example: Setting up requirements using Conda ```sh conda create -n imex-dev -c intel -c defaults -c conda-forge pip">=21.2.4" pre-commit cmake clang-format lit doxygen @@ -80,6 +87,8 @@ cmake -G Ninja -B build -S llvm \ # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 # -DIMEX_ENABLE_OPENCL_RUNTIME=1 +# Additional if OpenCL library is not found by CMake +# -DOpenCL_LIBRARY=/PATH_TO/libOpenCL.so.1 ## usually at /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install @@ -109,6 +118,8 @@ cmake -G Ninja -B build -S . \ # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 # -DIMEX_ENABLE_OPENCL_RUNTIME=1 +# Additional if OpenCL library is not found by CMake +# -DOpenCL_LIBRARY=/PATH_TO/libOpenCL.so.1 ## usually at /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install @@ -130,6 +141,8 @@ cmake -G Ninja -B build -S . \ # -DIMEX_ENABLE_L0_RUNTIME=1 # -DIMEX_ENABLE_SYCL_RUNTIME=1 # -DIMEX_ENABLE_OPENCL_RUNTIME=1 +# Additional if OpenCL library is not found by CMake +# -DOpenCL_LIBRARY=/PATH_TO/libOpenCL.so.1 ## usually at /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 # Additional if using a non system wide Level Zero Loader built from source # -DLEVEL_ZERO_DIR=/PATH_TO/level-zero-install diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index 0983a8547..1427d6429 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Intel Corporation +// Copyright 2024 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From 468012e20f649b173acfc7aa249672d682756d4a Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Tue, 2 Jul 2024 03:52:19 +0000 Subject: [PATCH 09/10] cleanup headers --- .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index 1427d6429..e08a456f4 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -12,25 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License.
-#include "llvm/Support/raw_ostream.h" -#include +#include +#include #include #include #include -#include #include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -#include +#include #ifdef _WIN32 #define SYCL_RUNTIME_EXPORT __declspec(dllexport) From 9084a59579e7389d481f20cf8fb01d86733b9ad8 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 11 Jul 2024 04:03:24 +0000 Subject: [PATCH 10/10] fix OCL_RUNTIME_EXPORT --- .../OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp index e08a456f4..5347bbebb 100644 --- a/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp +++ b/lib/ExecutionEngine/OPENCLRUNTIME/OpenCLRuntimeWrappers.cpp @@ -22,9 +22,9 @@ #include #ifdef _WIN32 -#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#define OCL_RUNTIME_EXPORT __declspec(dllexport) #else -#define SYCL_RUNTIME_EXPORT +#define OCL_RUNTIME_EXPORT #endif // _WIN32 namespace { @@ -364,8 +364,8 @@ static GPUCLQUEUE *getDefaultQueue() { // Wrappers -extern "C" SYCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device, - void *context) { +extern "C" OCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device, + void *context) { // todo: this is a workaround of issue of gpux generating multiple streams if (!device && !context) { return getDefaultQueue(); @@ -374,13 +374,13 @@ extern "C" SYCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device, reinterpret_cast(context), nullptr); } -extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) { +extern "C" OCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) { // todo: this is a workaround of issue of gpux generating multiple streams // should uncomment the below line to release the queue // delete queue; } -extern "C" SYCL_RUNTIME_EXPORT void * +extern "C" OCL_RUNTIME_EXPORT void * gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { if (queue) { return allocDeviceMemory(queue, size, alignment, isShared); @@ -388,13 +388,13 @@ gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { return nullptr; } -extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUCLQUEUE *queue, void *ptr) { +extern "C" OCL_RUNTIME_EXPORT void gpuMemFree(GPUCLQUEUE *queue, void *ptr) { if (queue && ptr) { deallocDeviceMemory(queue, ptr); } } -extern "C" SYCL_RUNTIME_EXPORT cl_program +extern "C" OCL_RUNTIME_EXPORT cl_program gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) { if (queue) { return loadModule(queue, data, dataSize); @@ -402,16 +402,16 @@ gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) { return nullptr; } -extern "C" SYCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUCLQUEUE *queue, - cl_program module, - const char *name) { +extern "C" OCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUCLQUEUE *queue, + cl_program module, + const char *name) { if (queue) { return getKernel(queue, module, name); } return nullptr; } -extern "C" SYCL_RUNTIME_EXPORT void +extern "C" OCL_RUNTIME_EXPORT void gpuLaunchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, void *params) { @@ -421,7 +421,7 @@ gpuLaunchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY, } } -extern "C" SYCL_RUNTIME_EXPORT void 
gpuWait(GPUCLQUEUE *queue) { +extern "C" OCL_RUNTIME_EXPORT void gpuWait(GPUCLQUEUE *queue) { if (queue) { CL_SAFE_CALL(clFinish(queue->queue_)); }
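The exported functions above form the C interface that MLIR-generated host code (or a hand-written driver) is expected to call into the OpenCL runtime wrapper. The sketch below is an editorial illustration, not part of the patch: it only re-declares the entry points visible in this patch and walks through a plausible call sequence. The buffer size, kernel name, and SPIR-V blob are placeholders, and the packing of the `params` block for `gpuLaunchKernel` is internal to the wrapper, so the launch call itself is left as a comment.

```cpp
// Minimal host-side sketch (hypothetical driver, assumptions noted inline).
#include <CL/cl.h>
#include <cstddef>
#include <vector>

struct GPUCLQUEUE; // opaque queue handle owned by the wrapper

extern "C" {
GPUCLQUEUE *gpuCreateStream(void *device, void *context);
void gpuStreamDestroy(GPUCLQUEUE *queue);
void *gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment,
                  bool isShared);
void gpuMemFree(GPUCLQUEUE *queue, void *ptr);
cl_program gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data,
                         size_t dataSize);
cl_kernel gpuKernelGet(GPUCLQUEUE *queue, cl_program module, const char *name);
void gpuWait(GPUCLQUEUE *queue);
}

int main() {
  // Passing null device/context returns the shared default queue
  // (see the workaround in gpuCreateStream above).
  GPUCLQUEUE *queue = gpuCreateStream(nullptr, nullptr);

  // Shared allocation; size and alignment values are placeholders.
  void *buffer = gpuMemAlloc(queue, /*size=*/1024, /*alignment=*/64,
                             /*isShared=*/true);

  // `spirv` would hold a SPIR-V module produced by the IMEX pipeline;
  // it is left empty here, so this sketch is structural only.
  std::vector<unsigned char> spirv;
  cl_program module = gpuModuleLoad(queue, spirv.data(), spirv.size());
  cl_kernel kernel = gpuKernelGet(queue, module, "entry_kernel"); // name is hypothetical
  (void)kernel;

  // gpuLaunchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY,
  //                 blockZ, sharedMemBytes, params) would be issued here;
  // the layout of `params` is defined inside the wrapper and is omitted.

  gpuWait(queue);            // drains the queue via clFinish
  gpuMemFree(queue, buffer);
  gpuStreamDestroy(queue);   // currently a no-op, kept for symmetry
  return 0;
}
```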