Ultrafeedback default structured output (#876)
* Add default structured output for GenerateSentencePair task

* Move default behavior to base class

* Add docstrings to the methods and move json schemas to the class method

* Add tests for default structured outputs in sentence transformers task

* Add control for parsing errors on JSON data

* Add default structured output for ComplexityScorer task

* Add default structured output for QualityScorer task

* Add example to the docstrings

* Refactor code per code review to simplify creating the default schemas

* Add extra check to avoid setting the structured output if the method wasn't overridden

* Refactor get_structured_output to return just the schema

* Add reference for the JSON schema

* Refactor get_structured_output to return just the schema

* Add default structured output for UltraFeedback task
plaguss authored Aug 9, 2024
1 parent aa616a1 commit c006ddc
Showing 2 changed files with 264 additions and 14 deletions.
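Before the diff, a minimal usage sketch assembled from the docstring examples and tests added in this commit (the model id and the `overall-rating` aspect are illustrative choices, not requirements; `use_default_structured_output=True` is the flag that opts into the JSON-schema output introduced here):

```python
# Minimal sketch based on the docstring examples in this commit; the model id
# and aspect are illustrative placeholders.
from distilabel.llms.huggingface import InferenceEndpointsLLM
from distilabel.steps.tasks import UltraFeedback

ultrafeedback = UltraFeedback(
    llm=InferenceEndpointsLLM(
        model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
    ),
    aspect="overall-rating",
    use_default_structured_output=True,  # enable the JSON schema added in this commit
)
ultrafeedback.load()

result = next(
    ultrafeedback.process(
        [{"instruction": "How much is 2+2?", "generations": ["4", "and a car"]}]
    )
)
# Each row gains "ratings" and "rationales"; the "helpfulness" and "truthfulness"
# aspects additionally produce "types" and "rationales-for-ratings".
```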
195 changes: 193 additions & 2 deletions src/distilabel/steps/tasks/ultrafeedback.py
@@ -22,8 +22,10 @@

from typing import Any, Dict, List, Literal, Optional, Union

import orjson
from jinja2 import Template
from pydantic import PrivateAttr
from typing_extensions import override

from distilabel.steps.tasks.base import Task
from distilabel.steps.tasks.typing import ChatType
@@ -74,13 +76,14 @@ class UltraFeedback(Task):
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="mistralai/Mistral-7B-Instruct-v0.2",
)
),
use_default_structured_output=False
)
ultrafeedback.load()
result = next(
chat.process(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
@@ -101,6 +104,82 @@ class UltraFeedback(Task):
# ]
```
Rate generations from different LLMs based on honesty, using the default structured output:
```python
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms.huggingface import InferenceEndpointsLLM
# Consider this as a placeholder for your actual LLM.
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
),
aspect="honesty"
)
ultrafeedback.load()
result = next(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
"generations": ["4", "and a car"],
}
]
)
)
# result
# [{'instruction': 'How much is 2+2?',
# 'generations': ['4', 'and a car'],
# 'ratings': [5, 1],
# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',
# "The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."],
# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{"ratings": [\n 5,\n 1\n] \n\n,"rationales": [\n "The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.",\n "The response is confidently incorrect, as it provides unrelated information (\'a car\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer."\n] }'},
# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]
```
Rate generations from different LLMs based on helpfulness, using the default structured output:
```python
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms.huggingface import InferenceEndpointsLLM
# Consider this as a placeholder for your actual LLM.
ultrafeedback = UltraFeedback(
llm=InferenceEndpointsLLM(
model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
generation_kwargs={"max_new_tokens": 512},
),
aspect="helpfulness"
)
ultrafeedback.load()
result = next(
ultrafeedback.process(
[
{
"instruction": "How much is 2+2?",
"generations": ["4", "and a car"],
}
]
)
)
# result
# [{'instruction': 'How much is 2+2?',
# 'generations': ['4', 'and a car'],
# 'ratings': [1, 5],
# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',
# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],
# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',
# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],
# 'types': [1, 3, 1],
# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.",\n "Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question."\n ]\n ,\n "rationales_for_rating": [\n "Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.",\n "Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question."\n ]\n ,\n "types": [\n 1, 3,\n 1\n ]\n }'},
# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]
```
Citations:
```
@@ -220,6 +299,9 @@ def _format_ratings_rationales_output(
"rationales": [None] * len(input["generations"]),
}

if self.use_default_structured_output:
return self._format_structured_output(output, input)

pattern = r"Rating: (.+?)\nRationale: (.+)"
sections = output.split("\n\n")

@@ -254,6 +336,9 @@ def _format_types_ratings_rationales_output(
"rationales-for-ratings": [None] * len(input["generations"]),
}

if self.use_default_structured_output:
return self._format_structured_output(output, input)

pattern = r"Type: (.+?)\nRationale: (.+?)\nRating: (.+?)\nRationale: (.+)"

sections = output.split("\n\n")
@@ -287,3 +372,109 @@ def _format_types_ratings_rationales_output(
}
)
return group_dicts(*formatted_outputs)

@override
def get_structured_output(self) -> Dict[str, Any]:
"""Creates the json schema to be passed to the LLM, to enforce generating
a dictionary with the output which can be directly parsed as a python dictionary.
The schema corresponds to the following:
```python
from typing import List, Optional
from pydantic import BaseModel
class SchemaUltraFeedback(BaseModel):
ratings: List[int]
rationales: List[str]
class SchemaUltraFeedbackWithType(BaseModel):
types: List[Optional[int]]
ratings: List[int]
rationales: List[str]
rationales_for_rating: List[str]
```
Returns:
JSON Schema of the response to enforce.
"""
if self.aspect in [
"honesty",
"instruction-following",
"overall-rating",
]:
return {
"properties": {
"ratings": {
"items": {"type": "integer"},
"title": "Ratings",
"type": "array",
},
"rationales": {
"items": {"type": "string"},
"title": "Rationales",
"type": "array",
},
},
"required": ["ratings", "rationales"],
"title": "SchemaUltraFeedback",
"type": "object",
}
return {
"properties": {
"types": {
"items": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"title": "Types",
"type": "array",
},
"ratings": {
"items": {"type": "integer"},
"title": "Ratings",
"type": "array",
},
"rationales": {
"items": {"type": "string"},
"title": "Rationales",
"type": "array",
},
"rationales_for_rating": {
"items": {"type": "string"},
"title": "Rationales For Rating",
"type": "array",
},
},
"required": ["types", "ratings", "rationales", "rationales_for_rating"],
"title": "SchemaUltraFeedbackWithType",
"type": "object",
}

def _format_structured_output(
self, output: str, input: Dict[str, Any]
) -> Dict[str, Any]:
"""Parses the structured response, which should correspond to a dictionary
with either `positive`, or `positive` and `negative` keys.
Args:
output: The output from the `LLM`.
Returns:
Formatted output.
"""
try:
return orjson.loads(output)
except orjson.JSONDecodeError:
if self.aspect in [
"honesty",
"instruction-following",
"overall-rating",
]:
return {
"ratings": [None] * len(input["generations"]),
"rationales": [None] * len(input["generations"]),
}
return {
"ratings": [None] * len(input["generations"]),
"rationales": [None] * len(input["generations"]),
"types": [None] * len(input["generations"]),
"rationales-for-ratings": [None] * len(input["generations"]),
}
83 changes: 71 additions & 12 deletions tests/unit/steps/tasks/test_ultrafeedback.py
@@ -12,16 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List
from typing import Any, Dict, List, Union

import pytest
from distilabel.llms.base import LLM
from distilabel.llms.typing import GenerateOutput
from distilabel.pipeline.local import Pipeline
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.tasks.ultrafeedback import UltraFeedback


class UltraFeedbackLLM(LLM):
structured_output: Any = None

def load(self) -> None:
pass

@@ -43,14 +45,11 @@ def generate(

class TestUltraFeedback:
def test_process_with_simple_aspect(self) -> None:
pipeline = Pipeline(name="unit-test-pipeline")
llm = UltraFeedbackLLM()

task = UltraFeedback(
name="ultrafeedback",
aspect="instruction-following",
llm=llm,
pipeline=pipeline,
llm=UltraFeedbackLLM(),
use_default_structured_output=False,
)
task.load()

@@ -70,14 +69,11 @@ def test_process_with_simple_aspect(self) -> None:
]

def test_process_with_complex_aspect(self) -> None:
pipeline = Pipeline(name="unit-test-pipeline")
llm = UltraFeedbackLLM()

task = UltraFeedback(
name="ultrafeedback",
aspect="truthfulness",
llm=llm,
pipeline=pipeline,
llm=UltraFeedbackLLM(),
use_default_structured_output=False,
)
task.load()

@@ -97,3 +93,66 @@ def test_process_with_complex_aspect(self) -> None:
},
}
]

@pytest.mark.parametrize(
"output, use_default_structured_output, aspect, expected",
[
(
"{ \n random\n}",
True,
"honesty",
{"ratings": [None, None], "rationales": [None, None]},
),
(
'{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "rationale1",\n "rationale2"\n ]}',
True,
"honesty",
{"ratings": [1, 5], "rationales": ["rationale1", "rationale2"]},
),
(
"{ \n random\n}",
True,
"helpfulness",
{
"ratings": [None, None],
"rationales": [None, None],
"rationales-for-ratings": [None, None],
"types": [None, None],
},
),
(
'{ \n "ratings": [\n 1,\n 5\n ]\n ,\n "rationales": [\n "rationale1",\n "rationale2"\n ], "rationales-for-ratings": [\n "rationale1",\n "rationale2"\n ], "types": [\n 1,\n 2\n ]}',
True,
"helpfulness",
{
"ratings": [1, 5],
"rationales": ["rationale1", "rationale2"],
"rationales-for-ratings": ["rationale1", "rationale2"],
"types": [1, 2],
},
),
],
)
def test_format_output(
self,
output: Union[str, None],
use_default_structured_output: bool,
aspect: str,
expected: Dict[str, Any],
) -> None:
task = UltraFeedback(
llm=UltraFeedbackLLM(),
aspect=aspect,
use_default_structured_output=use_default_structured_output,
)
task.load()

result = task.format_output(
output=output,
input={
"instruction": "How much is 2+2?",
"generations": ["4", "something weird"],
},
)

assert result == expected
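
The parametrized cases above exercise the parse-or-fallback behavior of `_format_structured_output`; the same pattern can be reproduced in isolation with a short sketch that only needs `orjson` (shown here for the simple aspects):

```python
# Standalone sketch of the parse-or-fallback pattern the new tests exercise:
# valid JSON becomes a dict, malformed output degrades to None-filled lists
# sized to the number of generations.
import orjson


def parse_or_none(output: str, n_generations: int) -> dict:
    try:
        return orjson.loads(output)
    except orjson.JSONDecodeError:
        return {
            "ratings": [None] * n_generations,
            "rationales": [None] * n_generations,
        }


print(parse_or_none('{"ratings": [1, 5], "rationales": ["rationale1", "rationale2"]}', 2))
# {'ratings': [1, 5], 'rationales': ['rationale1', 'rationale2']}
print(parse_or_none("{ \n random\n}", 2))
# {'ratings': [None, None], 'rationales': [None, None]}
```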
