Merge branch 'main' into develop

argilla-io · Aug 23, 2024 · def7060 · def7060
2 parents 22db32c + ed88585
commit def7060
Show file tree

Hide file tree

Showing 81 changed files with 348 additions and 77 deletions.
diff --git a/.github/workflows/docs-pr-close.yml b/.github/workflows/docs-pr-close.yml
@@ -4,7 +4,9 @@ on:
   pull_request:
     types: [closed]
 
-concurrency: distilabel-docs
+concurrency:
+  group: distilabel-docs
+  cancel-in-progress: false
 
 jobs:
   cleanup:

diff --git a/.github/workflows/docs-pr.yml b/.github/workflows/docs-pr.yml
@@ -6,7 +6,9 @@ on:
       - opened
       - synchronize
 
-concurrency: distilabel-docs
+concurrency:
+  group: distilabel-docs
+  cancel-in-progress: false
 
 jobs:
   publish:

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -8,7 +8,9 @@ on:
     tags:
       - "**"
 
-concurrency: distilabel-docs
+concurrency:
+  group: distilabel-docs
+  cancel-in-progress: false
 
 jobs:
   publish:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
           - --fuzzy-match-generates-todo
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.4.5
+    rev: v0.6.2
     hooks:
       - id: ruff
         args: [--fix]

diff --git a/Makefile b/Makefile
@@ -2,11 +2,13 @@ sources = src/distilabel tests
 
 .PHONY: format
 format:
+	ruff --version
 	ruff check --fix $(sources)
 	ruff format $(sources)
 
 .PHONY: lint
 lint:
+	ruff --version
 	ruff check $(sources)
 	ruff format --check $(sources)
 

diff --git a/docs/sections/how_to_guides/advanced/scaling_with_ray.md b/docs/sections/how_to_guides/advanced/scaling_with_ray.md
@@ -179,13 +179,19 @@ ip_head=$head_node_ip:$port
 export ip_head
 echo "IP Head: $ip_head"
 
+# Generate a unique Ray tmp dir for the head node (just in case the default one is not writable)
+head_tmp_dir="/tmp/ray_tmp_${SLURM_JOB_ID}_head"
+
 echo "Starting HEAD at $head_node"
-srun --nodes=1 --ntasks=1 -w "$head_node" \
+OUTLINES_CACHE_DIR="/tmp/.outlines" srun --nodes=1 --ntasks=1 -w "$head_node" \ # (4)
     ray start --head --node-ip-address="$head_node_ip" --port=$port \
     --dashboard-host=0.0.0.0 \
+    --dashboard-port=8265 \
+    --temp-dir="$head_tmp_dir" \
     --block &
 
 # Give some time to head node to start...
+echo "Waiting a bit before starting worker nodes..."
 sleep 10
 
 # Start Ray worker nodes
@@ -194,20 +200,27 @@ worker_num=$((SLURM_JOB_NUM_NODES - 1))
 # Start from 1 (0 is head node)
 for ((i = 1; i <= worker_num; i++)); do
     node_i=${nodes_array[$i]}
+    worker_tmp_dir="/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i"
     echo "Starting WORKER $i at $node_i"
-    srun --nodes=1 --ntasks=1 -w "$node_i" \
+    OUTLINES_CACHE_DIR="/tmp/.outlines" srun --nodes=1 --ntasks=1 -w "$node_i" \
         ray start --address "$ip_head" \
+        --temp-dir="$worker_tmp_dir" \
         --block &
     sleep 5
 done
 
+# Give some time to the Ray cluster to gather info
+echo "Waiting a bit before submitting the job..."
+sleep 60
+
 # Finally submit the job to the cluster
 ray job submit --address http://localhost:8265 --working-dir ray-pipeline -- python -u pipeline.py
 ```
 
-1. In this case, we just want two nodes: one to run the Ray head node and one to run a worker. 
+1. In this case, we just want two nodes: one to run the Ray head node and one to run a worker.
 2. We just want to run a task per node i.e. the Ray command that starts the head/worker node.
 3. We have selected 1 GPU per node, but we could have selected more depending on the pipeline.
+4. We need to set the environment variable `OUTLINES_CACHE_DIR` to `/tmp/.outlines` to avoid issues with the nodes trying to read/write the same `outlines` cache files, which is not possible.
 
 ## `vLLM` and `tensor_parallel_size`
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -48,7 +48,7 @@ distilabel = "distilabel.cli.app:app"
 "distilabel/components-gallery" = "distilabel.utils.mkdocs.components_gallery:ComponentsGalleryPlugin"
 
 [project.optional-dependencies]
-dev = ["ruff == 0.4.5", "pre-commit >= 3.5.0"]
+dev = ["ruff == 0.6.2", "pre-commit >= 3.5.0"]
 docs = [
     "mkdocs-material >=9.5.17",
     "mkdocstrings[python] >= 0.24.0",

diff --git a/scripts/install_cpu_vllm.sh b/scripts/install_cpu_vllm.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+
+echo "Updating system and installing build dependencies..."
+sudo apt-get update -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev cmake
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+echo "Python version:"
+python --version
+
+echo "Python executable location:"
+which python
+
+echo "Installing Python build dependencies..."
+python -m pip install --upgrade pip
+python -m pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+
+echo "Cloning 'vllm-project/vllm' GitHub repository..."
+git clone https://github.com/vllm-project/vllm.git
+cd vllm || exit
+
+git fetch --tags
+latest_tag=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+
+echo "Checking out to '$latest_tag' tag..."
+git checkout "$latest_tag"
+
+echo "Installing vLLM CPU requirements..."
+python -m pip install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+echo "Installing vLLM for CPU..."
+export CMAKE_ARGS="-DPYTHON_EXECUTABLE=$(which python) -DPYTHON_INCLUDE_DIR=$(python -c "from sysconfig import get_path; print(get_path('include'))") -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")"
+echo "CMake args: $CMAKE_ARGS"
+VLLM_TARGET_DEVICE=cpu python setup.py install
+
+echo "Installation complete!"
diff --git a/scripts/install_dependencies.sh b/scripts/install_dependencies.sh
@@ -6,11 +6,13 @@ python_version=$(python -c "import sys; print(sys.version_info[:2])")
 
 python -m pip install uv
 
-uv pip install --system -e ".[dev,tests,anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,ollama,openai,outlines,vertexai,mistralai,instructor,sentence-transformers,faiss-cpu]"
-pip install llama-cpp-python
+uv pip install --system -e ".[anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai,mistralai,instructor,sentence-transformers,faiss-cpu]"
 
 if [ "${python_version}" != "(3, 12)" ]; then
-	uv pip install --system -e .[ray]
+  uv pip install --system -e .[ray]
 fi
 
+./scripts/install_cpu_vllm.sh
 uv pip install --system git+https://github.com/argilla-io/LLM-Blender.git
+
+uv pip install --system -e ".[dev,tests]"
diff --git a/src/distilabel/llms/huggingface/inference_endpoints.py b/src/distilabel/llms/huggingface/inference_endpoints.py
@@ -281,7 +281,7 @@ def load(self) -> None:  # noqa: C901
             self._model_name = client.repository
 
         self._aclient = AsyncInferenceClient(
-            model=self.base_url,
+            base_url=self.base_url,
             token=self.api_key.get_secret_value(),
         )
 

diff --git a/src/distilabel/llms/vllm.py b/src/distilabel/llms/vllm.py
@@ -38,16 +38,13 @@
 from distilabel.steps.tasks.typing import FormattedInput, OutlinesStructuredOutputType
 
 if TYPE_CHECKING:
-    from openai import OpenAI
+    from openai import OpenAI  # noqa
     from transformers import PreTrainedTokenizer
     from vllm import LLM as _vLLM
 
     from distilabel.steps.tasks.typing import StandardInput
 
 
-SamplingParams = None
-
-
 class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):
     """`vLLM` library LLM implementation.
 
@@ -177,10 +174,6 @@ def load(self) -> None:
 
         try:
             from vllm import LLM as _vLLM
-            from vllm import SamplingParams as _SamplingParams
-
-            global SamplingParams
-            SamplingParams = _SamplingParams
         except ImportError as ie:
             raise ImportError(
                 "vLLM is not installed. Please install it using `pip install vllm`."
@@ -319,6 +312,7 @@ def generate(  # type: ignore
         Returns:
             A list of lists of strings containing the generated responses for each input.
         """
+        from vllm import SamplingParams
 
         if extra_sampling_params is None:
             extra_sampling_params = {}

diff --git a/src/distilabel/pipeline/ray.py b/src/distilabel/pipeline/ray.py
@@ -175,14 +175,27 @@ def _init_ray(self) -> None:
                 runtime_env={"pip": self.requirements},
                 **self._ray_init_kwargs,
             )
-        else:
+        elif not ray.is_initialized():
+            # Init a local Ray cluster
             ray.init(**self._ray_init_kwargs)
 
-        # Get the number of GPUs per Ray node
+        self._ray_node_ids = self._get_ray_gpus_per_node()
+
+    def _get_ray_gpus_per_node(self) -> Dict[str, int]:
+        """Gets the number of GPUs per node in the Ray cluster.
+
+        Returns:
+            A dictionary in which the keys are the node IDs and the values the number of
+                GPUs per node.
+        """
+        import ray
+
+        gpus_per_node = {}
         for node in ray.nodes():
             node_id = node["NodeID"]
             gpus = int(node["Resources"].get("GPU", 0))
-            self._ray_node_ids[node_id] = gpus
+            gpus_per_node[node_id] = gpus
+        return gpus_per_node
 
     @property
     def QueueClass(self) -> Callable:
@@ -316,7 +329,7 @@ def _create_vllm_placement_group(
         selected_node_id = None
         gpus_left_needed = total_gpus_needed
         for node_id in self._ray_node_ids:
-            gpus_to_allocate = min(self._ray_node_ids[node_id], total_gpus_needed)
+            gpus_to_allocate = min(self._ray_node_ids[node_id], gpus_left_needed)
             self._ray_node_ids[node_id] -= gpus_to_allocate
             gpus_left_needed -= gpus_to_allocate
             if gpus_left_needed == 0:

diff --git a/src/distilabel/steps/tasks/structured_outputs/utils.py b/src/distilabel/steps/tasks/structured_outputs/utils.py
@@ -18,13 +18,15 @@
 from pydantic import BaseModel, Field, create_model
 
 
-def schema_as_dict(schema: Union[str, Type[BaseModel]]) -> Dict[str, Any]:
+def schema_as_dict(
+    schema: Union[str, Type[BaseModel], Dict[str, Any]],
+) -> Dict[str, Any]:
     """Helper function to obtain the schema and simplify serialization."""
-    if type(schema) == type(BaseModel):
+    if type(schema) is type(BaseModel):
         return schema.model_json_schema()
     elif isinstance(schema, str):
         return json.loads(schema)
-    return schema
+    return schema  # type: ignore
 
 
 # NOTE: The following functions were copied from:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,40 @@
+# Copyright 2023-present, Argilla, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING, List
+
+import pytest
+
+if TYPE_CHECKING:
+    from _pytest.config import Config
+    from _pytest.nodes import Item
+
+
+def pytest_configure(config: "Config") -> None:
+    config.addinivalue_line(
+        "markers",
+        "skip_python_versions(versions): mark test to be skipped on specified Python versions",
+    )
+
+
+def pytest_collection_modifyitems(config: "Config", items: List["Item"]) -> None:
+    current_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+    for item in items:
+        skip_versions_marker = item.get_closest_marker("skip_python_versions")
+        if skip_versions_marker:
+            versions_to_skip = skip_versions_marker.args[0]
+            if current_version in versions_to_skip:
+                skip_reason = f"Test not supported on Python {current_version}"
+                item.add_marker(pytest.mark.skip(reason=skip_reason))
diff --git a/tests/integration/test_cache.py b/tests/integration/test_cache.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import pytest
+
 from distilabel.pipeline import Pipeline
 from distilabel.steps import GeneratorStep, StepInput, step
 

diff --git a/tests/integration/test_dataset_without_step.py b/tests/integration/test_dataset_without_step.py
@@ -17,6 +17,7 @@
 import pandas as pd
 import pytest
 from datasets import Dataset
+
 from distilabel.pipeline import Pipeline
 from distilabel.steps import make_generator_step
 from distilabel.steps.base import Step, StepInput

diff --git a/tests/integration/test_ray_pipeline.py b/tests/integration/test_ray_pipeline.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 from typing import TYPE_CHECKING, Dict, List
 
 import pytest
+
 from distilabel.mixins.runtime_parameters import RuntimeParameter
 from distilabel.pipeline.ray import RayPipeline
 from distilabel.steps.base import Step, StepInput
@@ -148,9 +148,7 @@ def outputs(self) -> List[str]:
         return ["response"]
 
 
-@pytest.mark.skipif(
-    sys.version_info >= (3, 12), reason="`ray` is not compatible with `python>=3.12`"
-)
+@pytest.mark.skip_python_versions(["3.12"])
 def test_run_pipeline() -> None:
     import ray
     from ray.cluster_utils import Cluster

diff --git a/tests/integration/test_routing_batch_function.py b/tests/integration/test_routing_batch_function.py
@@ -17,6 +17,7 @@
 from typing import TYPE_CHECKING, List
 
 import pytest
+
 from distilabel.pipeline import Pipeline, routing_batch_function
 from distilabel.steps import LoadDataFromDicts, StepInput, step
 

diff --git a/tests/integration/test_using_fs_to_pass_data.py b/tests/integration/test_using_fs_to_pass_data.py
@@ -15,6 +15,7 @@
 from typing import TYPE_CHECKING, List
 
 import numpy as np
+
 from distilabel.pipeline import Pipeline
 from distilabel.steps import GeneratorStep, StepInput, step
 

diff --git a/tests/unit/cli/pipeline/test_app.py b/tests/unit/cli/pipeline/test_app.py
@@ -14,9 +14,9 @@
 
 from unittest import mock
 
-from distilabel.cli.app import app
 from typer.testing import CliRunner
 
+from distilabel.cli.app import app
 from tests.unit.cli.utils import TEST_PIPELINE_PATH
 
 runner = CliRunner()

diff --git a/tests/unit/cli/pipeline/utils.py b/tests/unit/cli/pipeline/utils.py
@@ -15,6 +15,7 @@
 from unittest import mock
 
 import pytest
+
 from distilabel.cli.pipeline.utils import (
     get_config_from_url,
     parse_runtime_parameters,

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
@@ -15,6 +15,7 @@
 from typing import TYPE_CHECKING, Any, List
 
 import pytest
+
 from distilabel.llms.base import LLM, AsyncLLM
 from distilabel.llms.mixins.magpie import MagpieChatTemplateMixin