fix: trace LLMEvaluator runs #1185
python/langsmith/evaluation/llm_evaluator.py

@@ -1,6 +1,7 @@
 """Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""

 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from uuid import uuid4

 from pydantic import BaseModel
@@ -10,32 +11,66 @@

 class CategoricalScoreConfig(BaseModel):
-    """Configuration for a categorical score."""
+    """Configuration for a categorical score.
+
+    Attributes:
+        key (str): The feedback key for the evaluator.
+        choices (List[str]): List of valid categorical values that can be assigned.
+        description (str): Detailed description provided to the LLM judge of what
+            this score evaluates.
+        reasoning_key (Optional[str]): Key used to store the reasoning/explanation
+            for the score. Defaults to None.
+        reasoning_description (Optional[str]): Description provided to the LLM judge
+            of what should be included in the reasoning. Defaults to None.
+    """

     key: str
     choices: List[str]
     description: str
-    include_explanation: bool = False
-    explanation_description: Optional[str] = None
+    reasoning_key: Optional[str] = None
+    reasoning_description: Optional[str] = None


 class ContinuousScoreConfig(BaseModel):
-    """Configuration for a continuous score."""
+    """Configuration for a continuous numerical score.
+
+    Attributes:
+        key (str): The feedback key for the evaluator.
+        min (float): The minimum allowed value for the score. Defaults to 0.
+        max (float): The maximum allowed value for the score. Defaults to 1.
+        description (str): Detailed description provided to the LLM judge of what
+            this score evaluates.
+        reasoning_key (Optional[str]): Key used to store the reasoning/explanation
+            for the score. Defaults to None.
+        reasoning_description (Optional[str]): Description provided to the LLM judge
+            of what should be included in the reasoning. Defaults to None.
+    """

     key: str
     min: float = 0
     max: float = 1
     description: str
-    include_explanation: bool = False
-    explanation_description: Optional[str] = None
+    reasoning_key: Optional[str] = None
+    reasoning_description: Optional[str] = None
Review comment: This is a breaking change, so we'll need to be able to handle accepting the other value in the init and log a deprecation warning.
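A minimal sketch of one way to keep accepting the old fields while warning, as the comment suggests (assumes Pydantic v2's `model_validator`; the validator name and messages are illustrative, not part of this PR):

```python
import warnings
from typing import Any, List, Optional

from pydantic import BaseModel, model_validator


class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    key: str
    choices: List[str]
    description: str
    reasoning_key: Optional[str] = None
    reasoning_description: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def _handle_deprecated_fields(cls, values: Any) -> Any:
        # Accept the pre-existing include_explanation / explanation_description
        # kwargs, map them onto the new reasoning_* fields, and warn the caller.
        if not isinstance(values, dict):
            return values
        if values.pop("include_explanation", False):
            warnings.warn(
                "`include_explanation` is deprecated; use `reasoning_key` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            values.setdefault("reasoning_key", "explanation")
        if "explanation_description" in values:
            warnings.warn(
                "`explanation_description` is deprecated; use "
                "`reasoning_description` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            values.setdefault(
                "reasoning_description", values.pop("explanation_description")
            )
        return values
```

With a shim like this, `CategoricalScoreConfig(..., include_explanation=True)` would behave as if `reasoning_key="explanation"` had been passed, while emitting a `DeprecationWarning`.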
 def _create_score_json_schema(
     score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
 ) -> dict:
     properties: Dict[str, Any] = {}

+    if score_config.reasoning_key:
+        properties[score_config.reasoning_key] = {
+            "type": "string",
+            "description": (
+                "The explanation for the score."
+                if score_config.reasoning_description is None
+                else score_config.reasoning_description
+            ),
+        }
+
     if isinstance(score_config, CategoricalScoreConfig):
-        properties["score"] = {
+        properties["value"] = {
             "type": "string",
             "enum": score_config.choices,
             "description": f"The score for the evaluation, one of "
Review comment: It isn't a score anymore, is it? It's the selected category.
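If the wording were changed along those lines, the categorical property might read something like this (a hypothetical helper just to illustrate the phrasing; not code from this PR):

```python
from typing import Any, Dict, List


def _categorical_value_property(choices: List[str]) -> Dict[str, Any]:
    # Illustrative only: describe the field as the selected category
    # rather than as a "score".
    return {
        "type": "string",
        "enum": choices,
        "description": (
            "The selected category for the evaluation, one of "
            + ", ".join(choices)
            + "."
        ),
    }


# _categorical_value_property(["Y", "N"])["description"]
# -> "The selected category for the evaluation, one of Y, N."
```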
@@ -52,23 +87,24 @@
     else:
         raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

-    if score_config.include_explanation:
-        properties["explanation"] = {
-            "type": "string",
-            "description": (
-                "The explanation for the score."
-                if score_config.explanation_description is None
-                else score_config.explanation_description
-            ),
-        }
-
     return {
         "title": score_config.key,
         "description": score_config.description,
         "type": "object",
         "properties": properties,
         "required": (
-            ["score", "explanation"] if score_config.include_explanation else ["score"]
+            [
+                (
+                    "value"
+                    if isinstance(score_config, CategoricalScoreConfig)
+                    else "score"
+                ),
+                score_config.reasoning_key,
+            ]
+            if score_config.reasoning_key
+            else [
+                "value" if isinstance(score_config, CategoricalScoreConfig) else "score"
+            ]
         ),
     }
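To make the shape of the new schema concrete, here is a sketch based on the updated unit test expectations further down in this diff; it assumes this branch, where `reasoning_key` replaces `include_explanation` and the categorical field is named `value` (`_create_score_json_schema` is a private helper, imported here only for illustration):

```python
from langsmith.evaluation.llm_evaluator import (
    CategoricalScoreConfig,
    _create_score_json_schema,
)

config = CategoricalScoreConfig(
    key="vagueness",
    choices=["Y", "N"],
    description="Whether the response is vague. Y for yes, N for no.",
    reasoning_key="explanation",
)

# Per the updated test expectations, this should produce roughly:
# {
#     "title": "vagueness",
#     "description": "Whether the response is vague. Y for yes, N for no.",
#     "type": "object",
#     "properties": {
#         "value": {
#             "type": "string",
#             "enum": ["Y", "N"],
#             "description": "The score for the evaluation, one of Y, N.",
#         },
#         "explanation": {
#             "type": "string",
#             "description": "The explanation for the score.",
#         },
#     },
#     "required": ["value", "explanation"],
# }
schema = _create_score_json_schema(config)
```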
@@ -194,7 +230,6 @@
                 "variables other than 'input', 'output', and 'expected'"
             )
         self.map_variables = map_variables
-
         self.score_config = score_config
         self.score_schema = _create_score_json_schema(self.score_config)
@@ -206,18 +241,27 @@
         self, run: Run, example: Optional[Example] = None
     ) -> Union[EvaluationResult, EvaluationResults]:
         """Evaluate a run."""
+        source_run_id = uuid4()
         variables = self._prepare_variables(run, example)
-        output: dict = cast(dict, self.runnable.invoke(variables))
-        return self._parse_output(output)
+        output: dict = cast(
+            dict,
+            self.runnable.invoke(variables, config={"run_id": source_run_id}),
+        )
+        return self._parse_output(output, str(source_run_id))

     @warn_beta
     async def aevaluate_run(
         self, run: Run, example: Optional[Example] = None
     ) -> Union[EvaluationResult, EvaluationResults]:
         """Asynchronously evaluate a run."""
+        source_run_id = uuid4()
         variables = self._prepare_variables(run, example)
-        output: dict = cast(dict, await self.runnable.ainvoke(variables))
-        return self._parse_output(output)
+        output: dict = cast(
+            dict,
+            await self.runnable.ainvoke(variables, config={"run_id": source_run_id}),
+        )
+
+        return self._parse_output(output, str(source_run_id))
Review comment: nit: Could leave as a UUID.

     def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
         """Prepare variables for model invocation."""
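Regarding the nit above, a small sketch of the difference (it assumes `EvaluationResult.source_run_id` accepts a `UUID` directly, which the comment implies but this snippet does not verify):

```python
from uuid import uuid4

from langsmith.evaluation import EvaluationResult

source_run_id = uuid4()

# What the diff currently does: stringify before handing it off.
as_str = EvaluationResult(key="rating", score=1.0, source_run_id=str(source_run_id))

# The reviewer's suggestion: skip the str() round-trip and pass the UUID through,
# letting _parse_output accept a UUID instead of a str.
as_uuid = EvaluationResult(key="rating", score=1.0, source_run_id=source_run_id)
```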
@@ -276,17 +320,25 @@

         return variables

-    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
+    def _parse_output(
+        self, output: dict, source_run_id: str
+    ) -> Union[EvaluationResult, EvaluationResults]:
         """Parse the model output into an evaluation result."""
         if isinstance(self.score_config, CategoricalScoreConfig):
-            value = output["score"]
-            explanation = output.get("explanation", None)
+            value = output["value"]
+            explanation = output.get(self.score_config.reasoning_key, None)
             return EvaluationResult(
-                key=self.score_config.key, value=value, comment=explanation
+                key=self.score_config.key,
+                value=value,
+                comment=explanation,
+                source_run_id=source_run_id,
             )
         elif isinstance(self.score_config, ContinuousScoreConfig):
             score = output["score"]
-            explanation = output.get("explanation", None)
+            explanation = output.get(self.score_config.reasoning_key, None)
             return EvaluationResult(
-                key=self.score_config.key, score=score, comment=explanation
+                key=self.score_config.key,
+                score=score,
+                comment=explanation,
+                source_run_id=source_run_id,
             )
Second file in the diff (unit tests for LLMEvaluator):
@@ -15,7 +15,7 @@ def test_llm_evaluator_init() -> None:
             key="vagueness",
             choices=["Y", "N"],
             description="Whether the response is vague. Y for yes, N for no.",
-            include_explanation=True,
Review comment: Should keep a couple of these tests with include_explanation around for backward compat testing.
Reply: added some backward compatibility tests.
+            reasoning_key="explanation",
         ),
     )
     assert evaluator is not None
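Following up on the backward-compat comment above, a test along these lines could exercise the old kwargs (a sketch only; it assumes a deprecation shim like the one outlined earlier in this discussion, so the test name and behavior are illustrative):

```python
import pytest

from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig


def test_categorical_config_accepts_deprecated_fields() -> None:
    # Hypothetical backward-compat test: the old include_explanation kwarg
    # should still work, emit a DeprecationWarning, and map onto reasoning_key.
    with pytest.warns(DeprecationWarning):
        config = CategoricalScoreConfig(
            key="vagueness",
            choices=["Y", "N"],
            description="Whether the response is vague. Y for yes, N for no.",
            include_explanation=True,
        )
    assert config.reasoning_key == "explanation"
```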
@@ -25,7 +25,7 @@ def test_llm_evaluator_init() -> None:
         "description": "Whether the response is vague. Y for yes, N for no.",
         "type": "object",
         "properties": {
-            "score": {
+            "value": {
                 "type": "string",
                 "enum": ["Y", "N"],
                 "description": "The score for the evaluation, one of Y, N.",
@@ -35,16 +35,14 @@ def test_llm_evaluator_init() -> None:
                 "description": "The explanation for the score.",
             },
         },
-        "required": ["score", "explanation"],
+        "required": ["value", "explanation"],
     }

     # Try a continuous score
     evaluator = LLMEvaluator(
         prompt_template="Rate the response from 0 to 1.\n{input}",
         score_config=ContinuousScoreConfig(
-            key="rating",
-            description="The rating of the response, from 0 to 1.",
-            include_explanation=False,
+            key="rating", description="The rating of the response, from 0 to 1."
         ),
     )
@@ -71,19 +69,15 @@ def test_llm_evaluator_init() -> None:
         LLMEvaluator(
             prompt_template="Rate the response from 0 to 1.\n{input}",
             score_config=ContinuousScoreConfig(
-                key="rating",
-                description="The rating of the response, from 0 to 1.",
-                include_explanation=False,
+                key="rating", description="The rating of the response, from 0 to 1."
             ),
             model_provider="invalid",
         )

     evaluator = LLMEvaluator(
         prompt_template="Rate the response from 0 to 1.\n{input} {output} {expected}",
         score_config=ContinuousScoreConfig(
-            key="rating",
-            description="The rating of the response, from 0 to 1.",
-            include_explanation=False,
+            key="rating", description="The rating of the response, from 0 to 1."
         ),
     )
     assert evaluator is not None
@@ -103,9 +97,7 @@ def test_llm_evaluator_init() -> None:
     evaluator = LLMEvaluator(
         prompt_template="Rate the response from 0 to 1.\n{input} {output} {hello}",
         score_config=ContinuousScoreConfig(
-            key="rating",
-            description="The rating of the response, from 0 to 1.",
-            include_explanation=False,
+            key="rating", description="The rating of the response, from 0 to 1."
         ),
         map_variables=lambda run, example: {"hello": "world"},
     )
@@ -120,9 +112,7 @@ def test_from_model() -> None:
         ChatOpenAI(),
         prompt_template="Rate the response from 0 to 1.\n{input}",
         score_config=ContinuousScoreConfig(
-            key="rating",
-            description="The rating of the response, from 0 to 1.",
-            include_explanation=False,
+            key="rating", description="The rating of the response, from 0 to 1."
         ),
     )
     assert evaluator is not None
@@ -165,7 +155,6 @@ async def apredict(inputs: dict) -> dict:
             choices=["Y", "N"],
             description="Whether the output is accurate with respect to "
             "the expected output.",
-            include_explanation=False,
         ),
     )
@@ -183,7 +172,7 @@ async def apredict(inputs: dict) -> dict:
             choices=["Y", "N"],
             description="Whether the output is accurate with respect to "
             "the context and question.",
-            include_explanation=True,
+            reasoning_key="explanation",
         ),
         map_variables=lambda run, example: {
             "context": example.inputs.get("context", "") if example else "",
Review comment: Say what None means (don't include a reasoning/CoT field). Ditto with the description (defaults to "The explanation for the score."). I still think we should have a better default than "The explanation for the score." - like "Think step-by-step about what the correct score should be."
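One way the docstring and the default reasoning description could incorporate these suggestions (wording is illustrative only, not part of the diff):

```python
from typing import List, Optional

from pydantic import BaseModel

# Hypothetical revision of the default reasoning description, following the
# review suggestion above.
_DEFAULT_REASONING_DESCRIPTION = (
    "Think step-by-step about what the correct score should be."
)


class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score.

    Attributes:
        key (str): The feedback key for the evaluator.
        choices (List[str]): List of valid categorical values that can be assigned.
        description (str): Detailed description provided to the LLM judge of what
            this score evaluates.
        reasoning_key (Optional[str]): Key used to store the reasoning/explanation
            for the score. Defaults to None, meaning no reasoning/chain-of-thought
            field is requested from the judge.
        reasoning_description (Optional[str]): Description provided to the LLM judge
            of what should be included in the reasoning. Defaults to None, in which
            case a generic instruction such as the one above is used.
    """

    key: str
    choices: List[str]
    description: str
    reasoning_key: Optional[str] = None
    reasoning_description: Optional[str] = None
```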