Rename for_text_quality to for_overall_quality method in `UltraFeedbackTask` (#224)

* Rename `for_text_quality`->`for_overall_quality` and add docstring

* Update docs with `UltraFeedbackTask.for_overall_quality`

* Apply rename `for_text_quality`->`for_overall_quality`
alvarobartt authored Jan 8, 2024
1 parent 300a67d commit 39fb34b
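
For downstream users, the rename amounts to a one-line change wherever the labelling task is built. A minimal before/after sketch, mirroring the examples updated in this commit (the `distilabel.llm` import path is an assumption, not shown in the diff below):

```python
import os

from distilabel.llm import OpenAILLM  # assumed import path, not part of this diff
from distilabel.tasks import UltraFeedbackTask

# Before this commit:
# labeller = OpenAILLM(
#     task=UltraFeedbackTask.for_text_quality(),
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
# )

# After this commit, only the classmethod name changes:
labeller = OpenAILLM(
    task=UltraFeedbackTask.for_overall_quality(),
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
```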
Showing 7 changed files with 23 additions and 16 deletions.
@@ -4,6 +4,6 @@
 from distilabel.tasks import UltraFeedbackTask

 labeller = OpenAILLM(
-    task=UltraFeedbackTask.for_text_quality(),
+    task=UltraFeedbackTask.for_overall_quality(),
     openai_api_key=os.getenv("OPENAI_API_KEY"),
 )
19 changes: 11 additions & 8 deletions docs/technical-reference/tasks.md
@@ -76,14 +76,6 @@ The following snippet can be used as a simplified UltraFeedback Task, for which
     --8<-- "docs/snippets/technical-reference/tasks/ultrafeedback.py"
     ```

-=== "Text Quality"
-
-    The following example uses a `LLM` to examinate the data for text quality criteria, which includes the different criteria from UltraFeedback (Correctness & Informativeness, Honesty & Uncertainty, Truthfulness & Hallucination and Instruction Following):
-
-    ```python
-    --8<-- "docs/snippets/technical-reference/tasks/openai_for_text_quality.py"
-    ```
-
 === "Helpfulness"

     The following example creates a UltraFeedback task to emphasize helpfulness, that is overall quality and correctness of the output:
@@ -116,6 +108,17 @@ The following snippet can be used as a simplified UltraFeedback Task, for which
     --8<-- "docs/snippets/technical-reference/tasks/openai_for_instruction_following.py"
     ```

+Additionally, we at Argilla created a custom subtask for UltraFeedback that generates an overall score evaluating all the aspects mentioned above within a single subtask. Otherwise, to obtain an overall score, all the subtasks above would have to be run and their scores averaged.
+
+=== "Overall Quality"
+
+    The following example uses an `LLM` to examine the data against our custom overall quality criteria, which combine the different criteria from UltraFeedback (Correctness & Informativeness, Honesty & Uncertainty, Truthfulness & Hallucination, and Instruction Following):
+
+    ```python
+    --8<-- "docs/snippets/technical-reference/tasks/openai_for_overall_quality.py"
+    ```
+
+
 For the API reference visit [UltraFeedbackTask][distilabel.tasks.preference.ultrafeedback.UltraFeedbackTask].

 #### JudgeLMTask
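The paragraph added to the docs above contrasts the single overall-quality subtask with running every aspect-specific subtask and averaging the results. A rough sketch of that alternative, assuming the aspect-specific classmethods keep the names implied by the docs tabs (`for_helpfulness`, `for_honesty`, `for_truthfulness`, `for_instruction_following`; these names are assumptions, not part of this diff) and using placeholder ratings in place of real labeller output:

```python
from statistics import mean

from distilabel.tasks import UltraFeedbackTask

# One task per UltraFeedback aspect (classmethod names are assumptions, not shown in this commit)
aspect_tasks = [
    UltraFeedbackTask.for_instruction_following(),
    UltraFeedbackTask.for_honesty(),
    UltraFeedbackTask.for_truthfulness(),
    UltraFeedbackTask.for_helpfulness(),
]

# Placeholder ratings standing in for the scores a labeller LLM would return per aspect
aspect_ratings = [4, 5, 3, 4]

# Averaging across aspects, which `for_overall_quality()` avoids by scoring everything in one pass
overall_score = mean(aspect_ratings)
```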
2 changes: 1 addition & 1 deletion examples/pipeline-llamacpp-and-openai-process.py
@@ -63,7 +63,7 @@ def load_openai_llm(task: "Task") -> "LLM":
 pipeline = Pipeline(
     generator=ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_llama_cpp_llm),
     labeller=ProcessLLM(
-        task=UltraFeedbackTask.for_text_quality(), load_llm_fn=load_openai_llm
+        task=UltraFeedbackTask.for_overall_quality(), load_llm_fn=load_openai_llm
     ),
 )

2 changes: 1 addition & 1 deletion examples/pipeline-llamacpp-and-openai.py
@@ -39,7 +39,7 @@
     ),
     labeller=OpenAILLM(
         model="gpt-3.5-turbo",
-        task=UltraFeedbackTask.for_text_quality(),
+        task=UltraFeedbackTask.for_overall_quality(),
         max_new_tokens=128,
         num_threads=2,
         openai_api_key="<OPENAI_API_KEY>",
2 changes: 1 addition & 1 deletion examples/pipeline-vllm-and-openai.py
@@ -37,7 +37,7 @@
     ),
     labeller=OpenAILLM(
         model="gpt-3.5-turbo",
-        task=UltraFeedbackTask.for_text_quality(),
+        task=UltraFeedbackTask.for_overall_quality(),
         max_new_tokens=128,
         num_threads=2,
         openai_api_key=os.getenv("OPENAI_API_KEY", None),
4 changes: 2 additions & 2 deletions src/distilabel/pipeline.py
@@ -78,7 +78,7 @@ def __init__(
         ... )
         >>> labeller = OpenAILLM(
         ...     model="gpt-3.5-turbo",
-        ...     task=UltraFeedbackTask.for_text_quality(),
+        ...     task=UltraFeedbackTask.for_overall_quality(),
         ... )
         >>> pipeline = Pipeline(generator=generator, labeller=labeller)
         >>> dataset = pipeline.generate(dataset=..., num_generations=1, batch_size=1)
@@ -714,7 +714,7 @@ def generate(
         ... )
         >>> labeller = OpenAILLM(
         ...     model="gpt-3.5-turbo",
-        ...     task=UltraFeedbackTask.for_text_quality(),
+        ...     task=UltraFeedbackTask.for_overall_quality(),
         ... )
         >>> pipeline = Pipeline(generator=generator, labeller=labeller)
         >>> dataset = pipeline.generate(dataset=..., num_generations=1, batch_size=1)
8 changes: 6 additions & 2 deletions src/distilabel/tasks/preference/ultrafeedback.py
@@ -92,7 +92,7 @@ def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Promp
         Examples:
             >>> from distilabel.tasks.preference import UltraFeedbackTask
-            >>> task = UltraFeedbackTask.for_text_quality()
+            >>> task = UltraFeedbackTask.for_overall_quality()
             >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
             Prompt(
                 system_prompt="Your role is to evaluate text quality based on given criteria.",
@@ -142,12 +142,16 @@ def to_argilla_dataset(
         )

     @classmethod
-    def for_text_quality(
+    def for_overall_quality(
         cls,
         system_prompt: Optional[str] = None,
         task_description: Optional[str] = None,
         ratings: Optional[List[Rating]] = None,
     ) -> "UltraFeedbackTask":
+        """Classmethod for the `UltraFeedbackTask` subtask defined by Argilla, in order to
+        evaluate all the criteria originally defined in UltraFeedback at once, in a single
+        subtask.
+        """
         kwargs = {}
         if system_prompt is not None:
             kwargs.update({"system_prompt": system_prompt})

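The renamed classmethod keeps the same optional overrides as before; a minimal sketch of customizing it, using only the keyword arguments and the `generate_prompt` example visible in the diff above (the override string shown is illustrative):

```python
from distilabel.tasks import UltraFeedbackTask

# Defaults, equivalent to what `for_text_quality()` produced before the rename
task = UltraFeedbackTask.for_overall_quality()

# Overriding the system prompt; `task_description` and `ratings` can be overridden the same way
custom_task = UltraFeedbackTask.for_overall_quality(
    system_prompt="Your role is to evaluate text quality based on given criteria.",
)

# Prompt construction as in the docstring example above
prompt = task.generate_prompt(
    "What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"]
)
```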
0 comments on commit 39fb34b
