diff --git a/docs/snippets/technical-reference/tasks/openai_for_text_quality.py b/docs/snippets/technical-reference/tasks/openai_for_overall_quality.py
similarity index 77%
rename from docs/snippets/technical-reference/tasks/openai_for_text_quality.py
rename to docs/snippets/technical-reference/tasks/openai_for_overall_quality.py
index bd893dd779..10f3ff594f 100644
--- a/docs/snippets/technical-reference/tasks/openai_for_text_quality.py
+++ b/docs/snippets/technical-reference/tasks/openai_for_overall_quality.py
@@ -4,6 +4,6 @@
 from distilabel.tasks import UltraFeedbackTask
 
 labeller = OpenAILLM(
-    task=UltraFeedbackTask.for_text_quality(),
+    task=UltraFeedbackTask.for_overall_quality(),
     openai_api_key=os.getenv("OPENAI_API_KEY"),
 )
diff --git a/docs/technical-reference/tasks.md b/docs/technical-reference/tasks.md
index 0134779178..885db2128a 100644
--- a/docs/technical-reference/tasks.md
+++ b/docs/technical-reference/tasks.md
@@ -76,14 +76,6 @@ The following snippet can be used as a simplified UltraFeedback Task, for which
     --8<-- "docs/snippets/technical-reference/tasks/ultrafeedback.py"
     ```
 
-=== "Text Quality"
-
-    The following example uses a `LLM` to examinate the data for text quality criteria, which includes the different criteria from UltraFeedback (Correctness & Informativeness, Honesty & Uncertainty, Truthfulness & Hallucination and Instruction Following):
-
-    ```python
-    --8<-- "docs/snippets/technical-reference/tasks/openai_for_text_quality.py"
-    ```
-
 === "Helpfulness"
 
     The following example creates a UltraFeedback task to emphasize helpfulness, that is overall quality and correctness of the output:
@@ -116,6 +108,17 @@ The following snippet can be used as a simplified UltraFeedback Task, for which
     --8<-- "docs/snippets/technical-reference/tasks/openai_for_instruction_following.py"
     ```
 
+Additionally, we at Argilla created a custom subtask for UltraFeedback that generates an overall score evaluating all the aspects mentioned above within a single subtask; otherwise, to obtain an overall score, every subtask above would have to be run and their scores averaged.
+
+=== "Overall Quality"
+
+    The following example uses a `LLM` to examine the data for our custom overall quality criteria, which include the different criteria from UltraFeedback (Correctness & Informativeness, Honesty & Uncertainty, Truthfulness & Hallucination and Instruction Following):
+
+    ```python
+    --8<-- "docs/snippets/technical-reference/tasks/openai_for_overall_quality.py"
+    ```
+
+
 For the API reference visit [UltraFeedbackTask][distilabel.tasks.preference.ultrafeedback.UltraFeedbackTask].
 
 #### JudgeLMTask
diff --git a/examples/pipeline-llamacpp-and-openai-process.py b/examples/pipeline-llamacpp-and-openai-process.py
index df5f8cf11f..10102d08ab 100644
--- a/examples/pipeline-llamacpp-and-openai-process.py
+++ b/examples/pipeline-llamacpp-and-openai-process.py
@@ -63,7 +63,7 @@ def load_openai_llm(task: "Task") -> "LLM":
 
 pipeline = Pipeline(
     generator=ProcessLLM(task=TextGenerationTask(), load_llm_fn=load_llama_cpp_llm),
     labeller=ProcessLLM(
-        task=UltraFeedbackTask.for_text_quality(), load_llm_fn=load_openai_llm
+        task=UltraFeedbackTask.for_overall_quality(), load_llm_fn=load_openai_llm
     ),
 )
diff --git a/examples/pipeline-llamacpp-and-openai.py b/examples/pipeline-llamacpp-and-openai.py
index 564de142fd..3f609b0128 100644
--- a/examples/pipeline-llamacpp-and-openai.py
+++ b/examples/pipeline-llamacpp-and-openai.py
@@ -39,7 +39,7 @@
     ),
     labeller=OpenAILLM(
         model="gpt-3.5-turbo",
-        task=UltraFeedbackTask.for_text_quality(),
+        task=UltraFeedbackTask.for_overall_quality(),
         max_new_tokens=128,
         num_threads=2,
         openai_api_key="",
diff --git a/examples/pipeline-vllm-and-openai.py b/examples/pipeline-vllm-and-openai.py
index 03b29bf92a..e9f49b69e9 100644
--- a/examples/pipeline-vllm-and-openai.py
+++ b/examples/pipeline-vllm-and-openai.py
@@ -37,7 +37,7 @@
     ),
     labeller=OpenAILLM(
         model="gpt-3.5-turbo",
-        task=UltraFeedbackTask.for_text_quality(),
+        task=UltraFeedbackTask.for_overall_quality(),
         max_new_tokens=128,
         num_threads=2,
         openai_api_key=os.getenv("OPENAI_API_KEY", None),
diff --git a/src/distilabel/pipeline.py b/src/distilabel/pipeline.py
index 3096283a62..5430fc36c8 100644
--- a/src/distilabel/pipeline.py
+++ b/src/distilabel/pipeline.py
@@ -78,7 +78,7 @@ def __init__(
            ...     )
            >>> labeller = OpenAILLM(
            ...     model="gpt-3.5-turbo",
-           ...     task=UltraFeedbackTask.for_text_quality(),
+           ...     task=UltraFeedbackTask.for_overall_quality(),
            ...     )
            >>> pipeline = Pipeline(generator=generator, labeller=labeller)
            >>> dataset = pipeline.generate(dataset=..., num_generations=1, batch_size=1)
@@ -714,7 +714,7 @@ def generate(
            ...     )
            >>> labeller = OpenAILLM(
            ...     model="gpt-3.5-turbo",
-           ...     task=UltraFeedbackTask.for_text_quality(),
+           ...     task=UltraFeedbackTask.for_overall_quality(),
            ...     )
            >>> pipeline = Pipeline(generator=generator, labeller=labeller)
            >>> dataset = pipeline.generate(dataset=..., num_generations=1, batch_size=1)
diff --git a/src/distilabel/tasks/preference/ultrafeedback.py b/src/distilabel/tasks/preference/ultrafeedback.py
index 7583d085a5..ecadac4b12 100644
--- a/src/distilabel/tasks/preference/ultrafeedback.py
+++ b/src/distilabel/tasks/preference/ultrafeedback.py
@@ -92,7 +92,7 @@ def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Promp
 
         Examples:
             >>> from distilabel.tasks.preference import UltraFeedbackTask
-            >>> task = UltraFeedbackTask.for_text_quality()
+            >>> task = UltraFeedbackTask.for_overall_quality()
             >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"])
             Prompt(
                 system_prompt="Your role is to evaluate text quality based on given criteria.",
@@ -142,12 +142,16 @@ def to_argilla_dataset(
         )
 
     @classmethod
-    def for_text_quality(
+    def for_overall_quality(
         cls,
         system_prompt: Optional[str] = None,
         task_description: Optional[str] = None,
         ratings: Optional[List[Rating]] = None,
     ) -> "UltraFeedbackTask":
+        """Classmethod for the `UltraFeedbackTask` subtask defined by Argilla, intended to
+        evaluate all the criteria originally defined in UltraFeedback at once, within a
+        single subtask.
+ """ kwargs = {} if system_prompt is not None: kwargs.update({"system_prompt": system_prompt})