Add StructuredGeneration task and support for grammar in InferenceEndpointsLLM #680

Merged · 13 commits · Jun 3, 2024
2 changes: 1 addition & 1 deletion docs/sections/pipeline_samples/examples/index.md
@@ -10,7 +10,7 @@ Generate RPG characters following a `pydantic.BaseModel` with `outlines` in `distilabel`

 This script makes use of [`LlamaCppLLM`][distilabel.llms.llamacpp.LlamaCppLLM] and the structured output capabilities thanks to [`outlines`](https://outlines-dev.github.io/outlines/welcome/) to generate RPG characters that adhere to a JSON schema.

-It makes use of a local model which can be downlaoded using curl (explained in the script itself), and can be exchanged with other `LLMs` like [`vLLM`][distilabel.llms.vllm.vLLM].
+It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other `LLMs` like [`vLLM`][distilabel.llms.vllm.vLLM].

 ??? Run
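For context, the example this doc change points at combines `LlamaCppLLM` with `outlines`-backed structured outputs. A minimal sketch of that usage, assuming a locally downloaded GGUF file and a `{"format": ..., "schema": ...}` shape for `structured_output`; the `Character` model and the file name are placeholders, not code from this PR:

```python
# Hypothetical sketch, not the repository's example script.
from pydantic import BaseModel

from distilabel.llms import LlamaCppLLM


class Character(BaseModel):
    name: str
    role: str
    strength: int


llm = LlamaCppLLM(
    # Placeholder file name; the docs explain how to download a model with curl.
    model_path="./openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    # Assumed structured-output payload: constrain generations to the JSON schema.
    structured_output={"format": "json", "schema": Character},
)
llm.load()

# Inputs are OpenAI-style conversations.
generations = llm.generate(
    inputs=[[{"role": "user", "content": "Create an RPG character."}]]
)
print(generations)
```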
8 changes: 4 additions & 4 deletions src/distilabel/llms/anthropic.py
@@ -33,7 +33,7 @@
 from distilabel.llms.base import AsyncLLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 from distilabel.utils.itertools import grouper
 
 if TYPE_CHECKING:
@@ -163,7 +163,7 @@ def model_name(self) -> str:
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         max_tokens: int = 128,
         stop_sequences: Union[List[str], None] = None,
         temperature: float = 1.0,
@@ -223,7 +223,7 @@ async def agenerate(  # type: ignore
     @override
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["StandardInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -232,7 +232,7 @@ def generate(
         """
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["StandardInput"], **kwargs: Any
         ) -> "GenerateOutput":
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
14 changes: 8 additions & 6 deletions src/distilabel/llms/base.py
@@ -37,7 +37,7 @@
     InstructorStructuredOutputType,
 )
 from distilabel.steps.tasks.structured_outputs.outlines import StructuredOutputType
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import FormattedInput, StandardInput
 from distilabel.utils.docstring import Docstring
 
 if in_notebook():
@@ -94,7 +94,7 @@ def model_name(self) -> str:
     @abstractmethod
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["FormattedInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -187,7 +187,9 @@ def generate_parsed_docstring(self) -> "Docstring":
         """
         return parse_google_docstring(self.generate)
 
-    def get_last_hidden_states(self, inputs: List["ChatType"]) -> List["HiddenState"]:
+    def get_last_hidden_states(
+        self, inputs: List["StandardInput"]
+    ) -> List["HiddenState"]:
         """Method to get the last hidden states of the model for a list of inputs.
 
         Args:
@@ -264,7 +266,7 @@ def event_loop(self) -> "asyncio.AbstractEventLoop":
 
     @abstractmethod
     async def agenerate(
-        self, input: "ChatType", num_generations: int = 1, **kwargs: Any
+        self, input: "FormattedInput", num_generations: int = 1, **kwargs: Any
     ) -> List[Union[str, None]]:
         """Method to generate a `num_generations` responses for a given input asynchronously,
         and executed concurrently in `generate` method.
@@ -273,7 +275,7 @@ async def agenerate(
 
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["FormattedInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -282,7 +284,7 @@ def generate(
         """
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["FormattedInput"], **kwargs: Any
         ) -> List[List[Union[str, None]]]:
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
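The renames above lean on new aliases in `distilabel.steps.tasks.typing` that this diff imports but never shows. A plausible reconstruction of that module, hedged: only the names are confirmed by the imports here, and the exact shapes are inferred from the tuple that `InferenceEndpointsLLM.agenerate` unpacks further down:

```python
# Assumed contents of distilabel/steps/tasks/typing.py after this PR;
# the shapes below are a sketch, not the PR's verbatim code.
from typing import Any, Dict, List, Literal, Tuple, TypedDict, Union


class ChatItem(TypedDict):
    role: str
    content: str


# A plain OpenAI-style conversation: what every LLM already accepted.
ChatType = List[ChatItem]
StandardInput = ChatType


class Grammar(TypedDict):
    # Assumed: either a JSON schema or a regex constraining the generation.
    type: Literal["json", "regex"]
    value: Union[str, Dict[str, Any]]


# A formatted input is either a plain conversation, or a conversation paired
# with a per-input grammar: the tuple that `agenerate` checks for below.
StructuredInput = Tuple[StandardInput, Union[Grammar, None]]
FormattedInput = Union[StandardInput, StructuredInput]
```

Under that reading, `StandardInput` keeps the old `ChatType` contract for LLMs without grammar support, while `FormattedInput` widens the base `generate`/`agenerate` signatures only where structured generation can actually be honored.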
10 changes: 5 additions & 5 deletions src/distilabel/llms/cohere.py
@@ -30,7 +30,7 @@
 
 from distilabel.llms.base import AsyncLLM
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 from distilabel.utils.itertools import grouper
 
 if TYPE_CHECKING:
@@ -132,7 +132,7 @@ def load(self) -> None:
         self.structured_output = structured_output
 
     def _format_chat_to_cohere(
-        self, input: "ChatType"
+        self, input: "StandardInput"
     ) -> Tuple[Union[str, None], List["ChatMessage"], str]:
         """Formats the chat input to the Cohere Chat API conversational format.
@@ -169,7 +169,7 @@ def _format_chat_to_cohere(
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
         k: Optional[int] = None,
@@ -241,15 +241,15 @@ async def agenerate(  # type: ignore
     @override
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["StandardInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
         """Method to generate a list of responses asynchronously, returning the output
         synchronously awaiting for the response of each input sent to `agenerate`."""
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["StandardInput"], **kwargs: Any
         ) -> "GenerateOutput":
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
8 changes: 4 additions & 4 deletions src/distilabel/llms/groq.py
@@ -22,7 +22,7 @@
 from distilabel.llms.base import AsyncLLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.steps.base import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 from distilabel.utils.itertools import grouper
 
 if TYPE_CHECKING:
@@ -131,7 +131,7 @@ def model_name(self) -> str:
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         seed: Optional[int] = None,
         max_new_tokens: int = 128,
         temperature: float = 1.0,
@@ -188,7 +188,7 @@ async def agenerate(  # type: ignore
     @override
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["StandardInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -197,7 +197,7 @@ def generate(
         """
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["StandardInput"], **kwargs: Any
         ) -> "GenerateOutput":
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
22 changes: 17 additions & 5 deletions src/distilabel/llms/huggingface/inference_endpoints.py
@@ -31,7 +31,7 @@
 from distilabel.llms.base import AsyncLLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import FormattedInput, Grammar, StandardInput
 from distilabel.utils.itertools import grouper
 
 if TYPE_CHECKING:
@@ -148,6 +148,11 @@ class InferenceEndpointsLLM(AsyncLLM):
     model_display_name: Optional[str] = None
     use_openai_client: bool = False
 
+    grammar: Optional[RuntimeParameter[Grammar]] = Field(
+        default=None,
+        description="The grammar to use across all the generations.",
+    )
+
     _model_name: Optional[str] = PrivateAttr(default=None)
     _tokenizer: Optional["PreTrainedTokenizer"] = PrivateAttr(default=None)
     _api_key_env_var: str = PrivateAttr(_INFERENCE_ENDPOINTS_API_KEY_ENV_VAR_NAME)
@@ -290,7 +295,7 @@ def model_name(self) -> Union[str, None]:  # type: ignore
 
     async def _openai_agenerate(
         self,
-        input: "ChatType",
+        input: "StandardInput",
         max_new_tokens: int = 128,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
@@ -322,7 +327,7 @@ async def _openai_agenerate(
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: "FormattedInput",
         max_new_tokens: int = 128,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
@@ -379,6 +384,10 @@ async def agenerate(  # type: ignore
             )
             stop_sequences = stop_sequences[:4]
 
+        grammar = None
+        if isinstance(input, tuple):
+            input, grammar = input
+
         if self.use_openai_client:
             return await self._openai_agenerate(
                 input=input,
@@ -413,6 +422,9 @@ async def agenerate(  # type: ignore
             stop_sequences=stop_sequences,
             return_full_text=return_full_text,
             watermark=watermark,
+            # NOTE: `self.grammar` applies to all the generations, while `grammar` is intended
+            # to be different per each input, and those are not intended to be used together
+            grammar=grammar or self.grammar,  # type: ignore
             # NOTE: here to ensure that the cache is not used and a different response is
             # generated every time
             seed=seed or random.randint(0, 2147483647),
@@ -429,7 +441,7 @@ async def agenerate(  # type: ignore
     @override
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["FormattedInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -438,7 +450,7 @@ def generate(
         """
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["FormattedInput"], **kwargs: Any
         ) -> "GenerateOutput":
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
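Putting the pieces of this file together: `grammar` set on the LLM applies to every generation, a `(conversation, grammar)` tuple overrides it per input, and `grammar=grammar or self.grammar` gives the per-input value precedence. A hedged usage sketch; the model id and grammar payloads below are illustrative placeholders, not taken from the PR:

```python
# Hypothetical usage of the new grammar support; payload shapes are assumed.
from distilabel.llms import InferenceEndpointsLLM

llm = InferenceEndpointsLLM(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    # Runtime parameter applied across all generations.
    grammar={"type": "regex", "value": r"\d{4}"},
)
llm.load()

conversation = [
    {"role": "user", "content": "In which year did the French Revolution start?"}
]

# Plain input: falls back to `self.grammar` (the regex above).
plain = llm.generate(inputs=[conversation])

# (conversation, grammar) tuple: this grammar wins for this input only.
json_grammar = {
    "type": "json",
    "value": {"type": "object", "properties": {"year": {"type": "integer"}}},
}
structured = llm.generate(inputs=[(conversation, json_grammar)])
```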
10 changes: 6 additions & 4 deletions src/distilabel/llms/huggingface/transformers.py
@@ -21,7 +21,7 @@
 from distilabel.llms.chat_templates import CHATML_TEMPLATE
 from distilabel.llms.mixins import CudaDevicePlacementMixin
 from distilabel.llms.typing import GenerateOutput
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 
 if TYPE_CHECKING:
     from transformers import Pipeline
@@ -130,7 +130,7 @@ def model_name(self) -> str:
         """Returns the model name used for the LLM."""
         return self.model
 
-    def prepare_input(self, input: "ChatType") -> str:
+    def prepare_input(self, input: "StandardInput") -> str:
         """Prepares the input by applying the chat template to the input, which is formatted
         as an OpenAI conversation, and adding the generation prompt.
         """
@@ -143,7 +143,7 @@ def prepare_input(self, input: "ChatType") -> str:
     @validate_call
     def generate(  # type: ignore
         self,
-        inputs: List[ChatType],
+        inputs: List[StandardInput],
         num_generations: int = 1,
         max_new_tokens: int = 128,
         temperature: float = 0.1,
@@ -189,7 +189,9 @@ def generate(  # type: ignore
             for output in outputs
         ]
 
-    def get_last_hidden_states(self, inputs: List["ChatType"]) -> List["HiddenState"]:
+    def get_last_hidden_states(
+        self, inputs: List["StandardInput"]
+    ) -> List["HiddenState"]:
         """Gets the last `hidden_states` of the model for the given inputs. It doesn't
         execute the task head.
4 changes: 2 additions & 2 deletions src/distilabel/llms/litellm.py
@@ -20,7 +20,7 @@
 from distilabel.llms.base import AsyncLLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 
 if TYPE_CHECKING:
     from litellm import Choices
@@ -90,7 +90,7 @@ def model_name(self) -> str:
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         num_generations: int = 1,
         functions: Optional[List] = None,
         function_call: Optional[str] = None,
4 changes: 2 additions & 2 deletions src/distilabel/llms/llamacpp.py
@@ -19,7 +19,7 @@
 from distilabel.llms.base import LLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 
 if TYPE_CHECKING:
     from llama_cpp import CreateChatCompletionResponse, Llama, LogitsProcessorList
@@ -128,7 +128,7 @@ def model_name(self) -> str:
     @validate_call
     def generate(  # type: ignore
         self,
-        inputs: List[ChatType],
+        inputs: List[StandardInput],
         num_generations: int = 1,
         max_new_tokens: int = 128,
         frequency_penalty: float = 0.0,
8 changes: 4 additions & 4 deletions src/distilabel/llms/mistral.py
@@ -22,7 +22,7 @@
 from distilabel.llms.base import AsyncLLM
 from distilabel.llms.typing import GenerateOutput
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 from distilabel.utils.itertools import grouper
 
 if TYPE_CHECKING:
@@ -129,7 +129,7 @@ def model_name(self) -> str:
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         max_new_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
@@ -180,7 +180,7 @@ async def agenerate(  # type: ignore
     @override
     def generate(
         self,
-        inputs: List["ChatType"],
+        inputs: List["StandardInput"],
         num_generations: int = 1,
         **kwargs: Any,
     ) -> List["GenerateOutput"]:
@@ -189,7 +189,7 @@ def generate(
         """
 
         async def agenerate(
-            inputs: List["ChatType"], **kwargs: Any
+            inputs: List["StandardInput"], **kwargs: Any
         ) -> "GenerateOutput":
             """Internal function to parallelize the asynchronous generation of responses."""
             tasks = [
4 changes: 2 additions & 2 deletions src/distilabel/llms/ollama.py
@@ -19,7 +19,7 @@
 
 from distilabel.llms.base import AsyncLLM
 from distilabel.mixins.runtime_parameters import RuntimeParameter
-from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.typing import StandardInput
 
 if TYPE_CHECKING:
     from ollama import AsyncClient
@@ -117,7 +117,7 @@ def model_name(self) -> str:
     @validate_call
     async def agenerate(  # type: ignore
         self,
-        input: ChatType,
+        input: StandardInput,
         num_generations: int = 1,
         format: Literal["", "json"] = "",
         # TODO: include relevant options from `Options` in `agenerate` method.
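The new `StructuredGeneration` task named in the PR title is not part of this excerpt. Given the tuple handling in `agenerate`, its `format_input` plausibly pairs each row's conversation with that row's grammar; the sketch below captures only that idea, with assumed column names and a free-standing function in place of the real task method:

```python
# Illustrative only, not the task's real code: how a task could emit the
# (conversation, grammar) tuple that `InferenceEndpointsLLM.agenerate` unpacks.
from typing import Any, Dict, List, Tuple


def format_input(row: Dict[str, Any]) -> Tuple[List[Dict[str, str]], Dict[str, Any]]:
    """Assumed columns: 'instruction' and 'grammar' (one grammar per row)."""
    conversation = [{"role": "user", "content": row["instruction"]}]
    return (conversation, row["grammar"])
```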