Ultrafeedback default structured output (#876)
* Add default structured output for GenerateSentencePair task

* Move default behavior to base class

* Add docstrings to the methods and move json schemas to the class method

* Add tests for default structured outputs in sentence transformers task

* Add control for parsing errors on JSON data

* Add default structured output for ComplexityScorer task

* Add default structured output for QualityScorer task

* Add example to the docstrings

* Refactor code per code review to simplify creating the default schemas

* Add extra check to avoid setting the structured output if the method wasn't overridden

* Refactor get_structured_output to return just the schema

* Add reference for the JSON schema

* Refactor get_structured_output to return just the schema

* Add default structured output for UltraFeedback task
plaguss authored Aug 9, 2024
1 parent aa616a1 commit c006ddc
Showing 2 changed files with 264 additions and 14 deletions.
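Before the diff, a minimal usage sketch assembled from the docstring examples and tests added in this commit (the model id and the `overall-rating` aspect are illustrative choices, not requirements; `use_default_structured_output=True` is the flag that opts into the JSON-schema output introduced here):

```python
# Minimal sketch based on the docstring examples in this commit; the model id
# and aspect are illustrative placeholders.
from distilabel.llms.huggingface import InferenceEndpointsLLM
from distilabel.steps.tasks import UltraFeedback

ultrafeedback = UltraFeedback(
    llm=InferenceEndpointsLLM(
        model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
    ),
    aspect="overall-rating",
    use_default_structured_output=True,  # enable the JSON schema added in this commit
)
ultrafeedback.load()

result = next(
    ultrafeedback.process(
        [{"instruction": "How much is 2+2?", "generations": ["4", "and a car"]}]
    )
)
# Each row gains "ratings" and "rationales"; the "helpfulness" and "truthfulness"
# aspects additionally produce "types" and "rationales-for-ratings".
```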
195 changes: 193 additions & 2 deletions src/distilabel/steps/tasks/ultrafeedback.py
@@ -22,8 +22,10 @@

from typing import Any, Dict, List, Literal, Optional, Union

import orjson
from jinja2 import Template
from pydantic import PrivateAttr
from typing_extensions import override

from distilabel.steps.tasks.base import Task
from distilabel.steps.tasks.typing import ChatType
@@ -74,13 +76,14 @@ class UltraFeedback(Task):
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="mistralai/Mistral-7B-Instruct-v0.2",
)
),
use_default_structured_output=False
)
ultrafeedback.load()
result = next(
chat.process(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
@@ -101,6 +104,82 @@ class UltraFeedback(Task):
# ]
```
Rate generations from different LLMs based on honesty, using the default structured output:
```python
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms.huggingface import InferenceEndpointsLLM
# Consider this as a placeholder for your actual LLM.
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
),
aspect="honesty"
)
ultrafeedback.load()
result = next(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
"generations": ["4", "and a car"],
}
]
)
)
# result
# [{'instruction': 'How much is 2+2?',
# 'generations': ['4', 'and a car'],
# 'ratings': [5, 1],
# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',
# "The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."],
# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{"ratings": [\n 5,\n 1\n] \n\n,"rationales": [\n "The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.",\n "The response is confidently incorrect, as it provides unrelated information (\'a car\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."\n] }'},
# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]
```
Rate generations from different LLMs based on helpfulness, using the default structured output:
```python
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms.huggingface import InferenceEndpointsLLM
# Consider this as a placeholder for your actual LLM.
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
generation_kwargs={"max_new_tokens": 512},
),
aspect="helpfulness"
)
ultrafeedback.load()
result = next(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
"generations": ["4", "and a car"],
}
]
)
)
# result
# [{'instruction': 'How much is 2+2?',
# 'generations': ['4', 'and a car'],
# 'ratings': [1, 5],
# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',
# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],
# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',
# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],
# 'types': [1, 3, 1],
# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.",\n "Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question."\n ]\n ,\n "rationales_for_rating": [\n "Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.",\n "Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question."\n ]\n ,\n "types": [\n 1, 3,\n 1\n ]\n }'},
# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]
```
Citations:
```
@@ -220,6 +299,9 @@ def _format_ratings_rationales_output(
"rationales": [None] * len(input["generations"]),
}

if self.use_default_structured_output:
return self._format_structured_output(output, input)

pattern = r"Rating: (.+?)\nRationale: (.+)"
sections = output.split("\n\n")

@@ -254,6 +336,9 @@ def _format_types_ratings_rationales_output(
"rationales-for-ratings": [None] * len(input["generations"]),
}

if self.use_default_structured_output:
return self._format_structured_output(output, input)

pattern = r"Type: (.+?)\nRationale: (.+?)\nRating: (.+?)\nRationale: (.+)"

sections = output.split("\n\n")
@@ -287,3 +372,109 @@ def _format_types_ratings_rationales_output(
}
)
return group_dicts(*formatted_outputs)

@override
def get_structured_output(self) -> Dict[str, Any]:
"""Creates the json schema to be passed to the LLM, to enforce generating
a dictionary with the output which can be directly parsed as a python dictionary.
The schema corresponds to the following:
```python
from typing import List, Optional
from pydantic import BaseModel
class SchemaUltraFeedback(BaseModel):
ratings: List[int]
rationales: List[str]
class SchemaUltraFeedbackWithType(BaseModel):
types: List[Optional[int]]
ratings: List[int]
rationales: List[str]
rationales_for_rating: List[str]
```
Returns:
JSON Schema of the response to enforce.
"""
if self.aspect in [
"honesty",
"instruction-following",
"overall-rating",
]:
return {
"properties": {
"ratings": {
"items": {"type": "integer"},
"title": "Ratings",
"type": "array",
},
"rationales": {
"items": {"type": "string"},
"title": "Rationales",
"type": "array",
},
},
"required": ["ratings", "rationales"],
"title": "SchemaUltraFeedback",
"type": "object",
}
return {
"properties": {
"types": {
"items": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"title": "Types",
"type": "array",
},
"ratings": {
"items": {"type": "integer"},
"title": "Ratings",
"type": "array",
},
"rationales": {
"items": {"type": "string"},
"title": "Rationales",
"type": "array",
},
"rationales_for_rating": {
"items": {"type": "string"},
"title": "Rationales For Rating",
"type": "array",
},
},
"required": ["types", "ratings", "rationales", "rationales_for_rating"],
"title": "SchemaUltraFeedbackWithType",
"type": "object",
}

def _format_structured_output(
self, output: str, input: Dict[str, Any]
) -> Dict[str, Any]:
"""Parses the structured response, which should correspond to a dictionary
with either `positive`, or `positive` and `negative` keys.
Args:
output: The output from the `LLM`.
Returns:
Formatted output.
"""
try:
return orjson.loads(output)
except orjson.JSONDecodeError:
if self.aspect in [
"honesty",
"instruction-following",
"overall-rating",
]:
return {
"ratings": [None] * len(input["generations"]),
"rationales": [None] * len(input["generations"]),
}
return {
"ratings": [None] * len(input["generations"]),
"rationales": [None] * len(input["generations"]),
"types": [None] * len(input["generations"]),
"rationales-for-ratings": [None] * len(input["generations"]),
}
83 changes: 71 additions & 12 deletions tests/unit/steps/tasks/test_ultrafeedback.py
@@ -12,16 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List
from typing import Any, Dict, List, Union

import pytest
from distilabel.llms.base import LLM
from distilabel.llms.typing import GenerateOutput
from distilabel.pipeline.local import Pipeline
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.tasks.ultrafeedback import UltraFeedback


class UltraFeedbackLLM(LLM):
structured_output: Any = None

def load(self) -> None:
pass

@@ -43,14 +45,11 @@ def generate(

class TestUltraFeedback:
def test_process_with_simple_aspect(self) -> None:
pipeline = Pipeline(name="unit-test-pipeline")
llm = UltraFeedbackLLM()

task = UltraFeedback(
name="ultrafeedback",
aspect="instruction-following",
llm=llm,
pipeline=pipeline,
llm=UltraFeedbackLLM(),
use_default_structured_output=False,
)
task.load()

@@ -70,14 +69,11 @@ def test_process_with_simple_aspect(self) -> None:
]

def test_process_with_complex_aspect(self) -> None:
pipeline = Pipeline(name="unit-test-pipeline")
llm = UltraFeedbackLLM()

task = UltraFeedback(
name="ultrafeedback",
aspect="truthfulness",
llm=llm,
pipeline=pipeline,
llm=UltraFeedbackLLM(),
use_default_structured_output=False,
)
task.load()

@@ -97,3 +93,66 @@ def test_process_with_complex_aspect(self) -> None:
},
}
]

@pytest.mark.parametrize(
"output, use_default_structured_output, aspect, expected",
[
(
"{ \n random\n}",
True,
"honesty",
{"ratings": [None, None], "rationales": [None, None]},
),
(
'{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "rationale1",\n "rationale2"\n ]}',
True,
"honesty",
{"ratings": [1, 5], "rationales": ["rationale1", "rationale2"]},
),
(
"{ \n random\n}",
True,
"helpfulness",
{
"ratings": [None, None],
"rationales": [None, None],
"rationales-for-ratings": [None, None],
"types": [None, None],
},
),
(
'{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "rationale1",\n "rationale2"\n ], "rationales-for-ratings": [\n "rationale1",\n "rationale2"\n ], "types": [\n 1,\n 2\n ]}',
True,
"helpfulness",
{
"ratings": [1, 5],
"rationales": ["rationale1", "rationale2"],
"rationales-for-ratings": ["rationale1", "rationale2"],
"types": [1, 2],
},
),
],
)
def test_format_output(
self,
output: Union[str, None],
use_default_structured_output: bool,
aspect: str,
expected: Dict[str, Any],
) -> None:
task = UltraFeedback(
llm=UltraFeedbackLLM(),
aspect=aspect,
use_default_structured_output=use_default_structured_output,
)
task.load()

result = task.format_output(
output=output,
input={
"instruction": "How much is 2+2?",
"generations": ["4", "something weird"],
},
)

assert result == expected
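
The parametrized cases above exercise the parse-or-fallback behavior of `_format_structured_output`; the same pattern can be reproduced in isolation with a short sketch that only needs `orjson` (shown here for the simple aspects):

```python
# Standalone sketch of the parse-or-fallback pattern the new tests exercise:
# valid JSON becomes a dict, malformed output degrades to None-filled lists
# sized to the number of generations.
import orjson


def parse_or_none(output: str, n_generations: int) -> dict:
    try:
        return orjson.loads(output)
    except orjson.JSONDecodeError:
        return {
            "ratings": [None] * n_generations,
            "rationales": [None] * n_generations,
        }


print(parse_or_none('{"ratings": [1, 5], "rationales": ["rationale1", "rationale2"]}', 2))
# {'ratings': [1, 5], 'rationales': ['rationale1', 'rationale2']}
print(parse_or_none("{ \n random\n}", 2))
# {'ratings': [None, None], 'rationales': [None, None]}
```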
