Skip to content

Commit

Permalink
Remove unused methods and members from Cuda- and RocmExecutor
Browse files Browse the repository at this point in the history
- Remove LoadModuleFromHsaco from CudaExecutor (only relevant for ROCm)
- Remove state management variables from CudaExecutor (only used by ROCm)
- Replace std::map and std::unordered_map members with absl::flat_hash_map

PiperOrigin-RevId: 685884275
  • Loading branch information
beckerhe authored and Google-ML-Automation committed Oct 15, 2024
1 parent a009b7b commit 7b115e3
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 52 deletions.
6 changes: 0 additions & 6 deletions xla/stream_executor/cuda/cuda_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -653,12 +653,6 @@ absl::Status CudaExecutor::LoadModuleFromPtx(const char* ptx,
return absl::OkStatus();
}

// CUDA-side stub for loading an HSACO module. HSACO loading is supported on
// ROCm only, so this unconditionally returns an InternalError; neither `hsaco`
// nor `module` is read or written.
absl::Status CudaExecutor::LoadModuleFromHsaco(const char* hsaco,
CUmodule* module) {
return absl::InternalError(
"Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
const MultiKernelLoaderSpec& spec) {
auto cuda_kernel = std::make_unique<CudaKernel>(this);
Expand Down
27 changes: 4 additions & 23 deletions xla/stream_executor/cuda/cuda_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ limitations under the License.
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <variant>

Expand All @@ -42,10 +41,8 @@ limitations under the License.
#include "xla/stream_executor/event.h"
#include "xla/stream_executor/event_based_timer.h"
#include "xla/stream_executor/fft.h"
#include "xla/stream_executor/gpu/gpu_driver.h"
#include "xla/stream_executor/gpu/gpu_executor.h"
#include "xla/stream_executor/gpu/gpu_kernel.h"
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/memory_allocation.h"
Expand Down Expand Up @@ -154,32 +151,15 @@ class CudaExecutor : public GpuExecutor {
absl::Status LoadModuleFromPtx(const char* ptx, CUmodule* module)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

// (supported on ROCm only)
absl::Status LoadModuleFromHsaco(const char* hsaco, CUmodule* module)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

bool UnloadGpuBinary(const void* gpu_binary)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

// Returns true if a delay kernel is supported.
absl::StatusOr<bool> DelayKernelIsSupported();

// Guards the on-disk-module mapping.
absl::Mutex disk_modules_mu_;

// Mapping from filename to CUmodule, if it was already retrieved.
// Multiple CUfunctions are usually obtained from a single
// CUmodule so we attempt to hit in this mapping first, before
// retrieving it.
std::map<std::string, CUmodule> disk_modules_
ABSL_GUARDED_BY(disk_modules_mu_);

// Guards the in-memory-module mapping.
absl::Mutex in_memory_modules_mu_;

std::map<const char*, CUmodule> in_memory_modules_
ABSL_GUARDED_BY(in_memory_modules_mu_);

absl::Mutex shared_constants_mu_;
// On-device constants that can be shared between multiple executables. A
// pointer for a given constant will expire when no executables require use
Expand All @@ -188,10 +168,11 @@ class CudaExecutor : public GpuExecutor {
shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);

// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const Kernel*, const void*> kernel_to_gpu_binary_
absl::flat_hash_map<const Kernel*, const void*> kernel_to_gpu_binary_
ABSL_GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
std::unordered_map<const void*, std::pair<CUmodule, uint64_t>>

// GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
absl::flat_hash_map<const void*, std::pair<CUmodule, uint64_t>>
gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);

// Handle for the CUDA device being operated on. Immutable
Expand Down
7 changes: 1 addition & 6 deletions xla/stream_executor/rocm/rocm_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,7 @@ absl::Status LoadHsaco(Context* context, const char* hsaco_contents,
GetDriverExecutor()->Schedule(
[context, hsaco_contents, module, &returned_status, &notification]() {
ScopedActivateContext activation{context};
void* hsaco_data = const_cast<char*>(hsaco_contents);

hipError_t res = wrap::hipModuleLoadData(module, hsaco_data);
hipError_t res = wrap::hipModuleLoadData(module, hsaco_contents);

if (res != hipSuccess) {
returned_status = absl::InternalError(
Expand Down Expand Up @@ -482,9 +480,6 @@ void* HostAllocate(Context* context, uint64_t bytes) {
} // namespace

RocmExecutor::~RocmExecutor() {
for (auto& it : disk_modules_) {
UnloadRocmModule(gpu_context(), it.second);
}
for (auto& it : in_memory_modules_) {
UnloadRocmModule(gpu_context(), it.second);
}
Expand Down
23 changes: 6 additions & 17 deletions xla/stream_executor/rocm/rocm_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ limitations under the License.
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <variant>

Expand All @@ -32,6 +31,7 @@ limitations under the License.
#include "absl/status/statusor.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "rocm/include/hip/hip_runtime.h"
#include "xla/stream_executor/blas.h"
#include "xla/stream_executor/command_buffer.h"
#include "xla/stream_executor/device_description.h"
Expand All @@ -40,10 +40,8 @@ limitations under the License.
#include "xla/stream_executor/event.h"
#include "xla/stream_executor/event_based_timer.h"
#include "xla/stream_executor/fft.h"
#include "xla/stream_executor/gpu/gpu_driver.h"
#include "xla/stream_executor/gpu/gpu_executor.h"
#include "xla/stream_executor/gpu/gpu_kernel.h"
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/memory_allocation.h"
Expand Down Expand Up @@ -144,20 +142,10 @@ class RocmExecutor : public GpuExecutor {
// Creates a GpuEvent for the given stream.
absl::StatusOr<std::unique_ptr<RocmEvent>> CreateGpuEvent(bool allow_timing);

// Guards the on-disk-module mapping.
absl::Mutex disk_modules_mu_;

// Mapping from filename to hipModule_t, if it was already retrieved.
// Multiple hipFunction_ts are usually obtained from a single
// hipModule_t so we attempt to hit in this mapping first, before
// retrieving it.
std::map<std::string, hipModule_t> disk_modules_
ABSL_GUARDED_BY(disk_modules_mu_);

// Guards the in-memory-module mapping.
absl::Mutex in_memory_modules_mu_;

std::map<const char*, hipModule_t> in_memory_modules_
absl::flat_hash_map<const char*, hipModule_t> in_memory_modules_
ABSL_GUARDED_BY(in_memory_modules_mu_);

absl::Mutex shared_constants_mu_;
Expand All @@ -168,10 +156,11 @@ class RocmExecutor : public GpuExecutor {
shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);

// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const Kernel*, const void*> kernel_to_gpu_binary_
absl::flat_hash_map<const Kernel*, const void*> kernel_to_gpu_binary_
ABSL_GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN or HSACO) -> {module, reference count}.
std::unordered_map<const void*, std::pair<hipModule_t, uint64_t>>

// GPU binary HSACO -> {module, reference count}.
absl::flat_hash_map<const void*, std::pair<hipModule_t, uint64_t>>
gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);

// Handle for the ROCm device being operated on. Immutable
Expand Down

0 comments on commit 7b115e3

Please sign in to comment.