diff --git a/setup.py b/setup.py index 51ca5e2abecf7..4a20e49235ac8 100644 --- a/setup.py +++ b/setup.py @@ -382,8 +382,7 @@ def get_gaudi_sw_version(): output = subprocess.run("hl-smi", shell=True, text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, env={"ENABLE_CONSOLE": "true"}) if output.returncode == 0 and output.stdout: return output.stdout.split("\n")[2].replace( diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 28d1882cb0db7..a24bab6df370e 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -34,7 +34,7 @@ class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 5008a2abd22ea..7e9b2bd13b48a 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -67,8 +67,7 @@ class Singleton(type): def __call__(cls, *args, **kwargs): if cls not in cls._instances: - cls._instances[cls] = super(Singleton, - cls).__call__(*args, **kwargs) + cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] @@ -273,7 +272,7 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): return indices, offsets -class HpuModelAdapter(): +class HpuModelAdapter: def __init__(self, model, block_size, dtype, enforce_eager): self.model = model @@ -1643,7 +1642,7 @@ def _maybe_wrap_in_hpu_graph(*args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) -class HabanaProfilerCounterHelper(): +class HabanaProfilerCounterHelper: def __init__(self): self.niter = 0