Remove unused methods and members from Cuda- and RocmExecutor

- Remove LoadModuleFromHsaco from CudaExecutor (only relevant for ROCm)
- Remove state management variables from CudaExecutor (only used by ROCm)
- Replace map types with flat_hash_map

PiperOrigin-RevId: 685884275
openxla · Oct 15, 2024 · 7b115e3
1 parent a009b7b
commit 7b115e3
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 52 deletions.
diff --git a/xla/stream_executor/cuda/cuda_executor.cc b/xla/stream_executor/cuda/cuda_executor.cc
@@ -653,12 +653,6 @@ absl::Status CudaExecutor::LoadModuleFromPtx(const char* ptx,
   return absl::OkStatus();
 }
 
-absl::Status CudaExecutor::LoadModuleFromHsaco(const char* hsaco,
-                                               CUmodule* module) {
-  return absl::InternalError(
-      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
-}
-
 absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
     const MultiKernelLoaderSpec& spec) {
   auto cuda_kernel = std::make_unique<CudaKernel>(this);

diff --git a/xla/stream_executor/cuda/cuda_executor.h b/xla/stream_executor/cuda/cuda_executor.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
-#include <unordered_map>
 #include <utility>
 #include <variant>
 
@@ -42,10 +41,8 @@ limitations under the License.
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/event_based_timer.h"
 #include "xla/stream_executor/fft.h"
-#include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
-#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -154,32 +151,15 @@ class CudaExecutor : public GpuExecutor {
   absl::Status LoadModuleFromPtx(const char* ptx, CUmodule* module)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
-  // (supported on ROCm only)
-  absl::Status LoadModuleFromHsaco(const char* hsaco, CUmodule* module)
-      ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
   bool UnloadGpuBinary(const void* gpu_binary)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
 
   // Returns true if a delay kernel is supported.
   absl::StatusOr<bool> DelayKernelIsSupported();
 
-  // Guards the on-disk-module mapping.
-  absl::Mutex disk_modules_mu_;
-
-  // Mapping from filename to CUmodule, if it was already retrieved.
-  // Multiple CUfunctions are usually obtained from a single
-  // CUmodule so we attempt to hit in this mapping first, before
-  // retrieving it.
-  std::map<std::string, CUmodule> disk_modules_
-      ABSL_GUARDED_BY(disk_modules_mu_);
-
   // Guards the in-memory-module mapping.
   absl::Mutex in_memory_modules_mu_;
 
-  std::map<const char*, CUmodule> in_memory_modules_
-      ABSL_GUARDED_BY(in_memory_modules_mu_);
-
   absl::Mutex shared_constants_mu_;
   // On-device constants that can be shared between multiple executables. A
   // pointer for a given constant will expire when no executables require use
@@ -188,10 +168,11 @@ class CudaExecutor : public GpuExecutor {
       shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);
 
   // Kernel -> loaded GPU binary. Many kernels may load the same binary.
-  std::unordered_map<const Kernel*, const void*> kernel_to_gpu_binary_
+  absl::flat_hash_map<const Kernel*, const void*> kernel_to_gpu_binary_
       ABSL_GUARDED_BY(in_memory_modules_mu_);
-  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
-  std::unordered_map<const void*, std::pair<CUmodule, uint64_t>>
+
+  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
+  absl::flat_hash_map<const void*, std::pair<CUmodule, uint64_t>>
       gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);
 
   // Handle for the CUDA device being operated on. Immutable

diff --git a/xla/stream_executor/rocm/rocm_executor.cc b/xla/stream_executor/rocm/rocm_executor.cc
@@ -165,9 +165,7 @@ absl::Status LoadHsaco(Context* context, const char* hsaco_contents,
   GetDriverExecutor()->Schedule(
       [context, hsaco_contents, module, &returned_status, &notification]() {
         ScopedActivateContext activation{context};
-        void* hsaco_data = const_cast<char*>(hsaco_contents);
-
-        hipError_t res = wrap::hipModuleLoadData(module, hsaco_data);
+        hipError_t res = wrap::hipModuleLoadData(module, hsaco_contents);
 
         if (res != hipSuccess) {
           returned_status = absl::InternalError(
@@ -482,9 +480,6 @@ void* HostAllocate(Context* context, uint64_t bytes) {
 }  // namespace
 
 RocmExecutor::~RocmExecutor() {
-  for (auto& it : disk_modules_) {
-    UnloadRocmModule(gpu_context(), it.second);
-  }
   for (auto& it : in_memory_modules_) {
     UnloadRocmModule(gpu_context(), it.second);
   }

diff --git a/xla/stream_executor/rocm/rocm_executor.h b/xla/stream_executor/rocm/rocm_executor.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
-#include <unordered_map>
 #include <utility>
 #include <variant>
 
@@ -32,6 +31,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "rocm/include/hip/hip_runtime.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_description.h"
@@ -40,10 +40,8 @@ limitations under the License.
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/event_based_timer.h"
 #include "xla/stream_executor/fft.h"
-#include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
-#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -144,20 +142,10 @@ class RocmExecutor : public GpuExecutor {
   // Creates a GpuEvent for the given stream.
   absl::StatusOr<std::unique_ptr<RocmEvent>> CreateGpuEvent(bool allow_timing);
 
-  // Guards the on-disk-module mapping.
-  absl::Mutex disk_modules_mu_;
-
-  // Mapping from filename to hipModule_t, if it was already retrieved.
-  // Multiple hipFunction_ts are usually obtained from a single
-  // hipModule_t so we attempt to hit in this mapping first, before
-  // retrieving it.
-  std::map<std::string, hipModule_t> disk_modules_
-      ABSL_GUARDED_BY(disk_modules_mu_);
-
   // Guards the in-memory-module mapping.
   absl::Mutex in_memory_modules_mu_;
 
-  std::map<const char*, hipModule_t> in_memory_modules_
+  absl::flat_hash_map<const char*, hipModule_t> in_memory_modules_
       ABSL_GUARDED_BY(in_memory_modules_mu_);
 
   absl::Mutex shared_constants_mu_;
@@ -168,10 +156,11 @@ class RocmExecutor : public GpuExecutor {
       shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);
 
   // Kernel -> loaded GPU binary. Many kernels may load the same binary.
-  std::unordered_map<const Kernel*, const void*> kernel_to_gpu_binary_
+  absl::flat_hash_map<const Kernel*, const void*> kernel_to_gpu_binary_
       ABSL_GUARDED_BY(in_memory_modules_mu_);
-  // GPU binary (PTX or CUBIN or HSACO) -> {module, reference count}.
-  std::unordered_map<const void*, std::pair<hipModule_t, uint64_t>>
+
+  // GPU binary HSACO -> {module, reference count}.
+  absl::flat_hash_map<const void*, std::pair<hipModule_t, uint64_t>>
       gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);
 
   // Handle for the ROCm device being operated on. Immutable