Skip to content

Commit

Permalink
Fix: all replicas had the same `_llm_identifier` for `CudaDevicePlacementMixin` (#941)
Browse files Browse the repository at this point in the history

* Fix CUDA device placement with multiple replicas

* Print replica id

* Copy `step` for each replica
  • Loading branch information
gabrielmbmb authored Sep 2, 2024
1 parent c8f4d61 commit 56b4036
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 8 deletions.
5 changes: 0 additions & 5 deletions src/distilabel/llms/mixins/cuda_device_placement.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,6 @@ def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]
return device

return None
raise RuntimeError(
"Couldn't find an available CUDA device automatically to be used by the LLM"
f" '{self._llm_identifier}'. For forcing the use of a specific device, set the"
" `cuda_devices` attribute to a list with the desired device(s)."
)

def _set_cuda_visible_devices(self) -> None:
"""Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices
Expand Down
10 changes: 8 additions & 2 deletions src/distilabel/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1222,8 +1222,14 @@ def _run_steps(self, steps: Iterable[str]) -> None:

step_num_replicas: int = step.resources.replicas if step.is_normal else 1 # type: ignore
for replica in range(step_num_replicas):
self._logger.debug(f"Running 1 replica of step '{step.name}'...")
self._run_step(step=step, input_queue=input_queue, replica=replica)
self._logger.debug(
f"Running 1 replica of step '{step.name}' with ID {replica}..."
)
self._run_step(
step=step.model_copy(deep=True),
input_queue=input_queue,
replica=replica,
)

def _add_batches_back_to_batch_manager(self) -> None:
"""Add the `Batch`es that were sent to a `Step` back to the `_BatchManager`. This
Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/pipeline/step_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _init_cuda_device_placement_mixin(attr: CudaDevicePlacementMixin) -> None:
attr.disable_cuda_device_placement = True
else:
desired_num_gpus = self.step.resources.gpus or 1
attr._llm_identifier = self.step.name
attr._llm_identifier = f"{self.step.name}-replica-{self.replica}"
attr._desired_num_gpus = desired_num_gpus

for field_name in self.step.model_fields_set:
Expand Down

0 comments on commit 56b4036

Please sign in to comment.