fix: trace LLMEvaluator runs #1185

Open · wants to merge 9 commits into main

Changes from 8 commits
110 changes: 81 additions & 29 deletions python/langsmith/evaluation/llm_evaluator.py
@@ -1,6 +1,7 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""

GitHub Actions / benchmark: workflow annotations on python/langsmith/evaluation/llm_evaluator.py, line 1

Benchmark results (Mean +- std dev):

    create_5_000_run_trees:                         618 ms +- 43 ms
    create_10_000_run_trees:                        1.19 sec +- 0.05 sec
    create_20_000_run_trees:                        1.20 sec +- 0.05 sec
    dumps_class_nested_py_branch_and_leaf_200x400:  701 us +- 7 us
    dumps_class_nested_py_leaf_50x100:              24.8 ms +- 0.2 ms
    dumps_class_nested_py_leaf_100x200:             103 ms +- 2 ms
    dumps_dataclass_nested_50x100:                  25.2 ms +- 0.2 ms
    dumps_pydantic_nested_50x100:                   65.8 ms +- 15.3 ms  (pyperf warning: std dev is 23% of the mean; result may be unstable)
    dumps_pydanticv1_nested_50x100:                 218 ms +- 29 ms     (pyperf warning: std dev is 13% of the mean; result may be unstable)

Comparison against main:

    +-----------------------------------------------+---------+-----------------------+
    | Benchmark                                     | main    | changes               |
    +===============================================+=========+=======================+
    | dumps_pydantic_nested_50x100                  | 71.7 ms | 65.8 ms: 1.09x faster |
    | dumps_pydanticv1_nested_50x100                | 228 ms  | 218 ms: 1.04x faster  |
    | dumps_dataclass_nested_50x100                 | 25.9 ms | 25.2 ms: 1.03x faster |
    | dumps_class_nested_py_leaf_50x100             | 25.5 ms | 24.8 ms: 1.03x faster |
    | dumps_class_nested_py_branch_and_leaf_200x400 | 705 us  | 701 us: 1.00x faster  |
    | Geometric mean                                | (ref)   | 1.03x faster          |
    +-----------------------------------------------+---------+-----------------------+

    Benchmarks hidden because not significant (4): create_5_000_run_trees, create_20_000_run_trees, create_10_000_run_trees, dumps_class_nested_py_leaf_100x200

from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from uuid import uuid4

from pydantic import BaseModel

@@ -10,32 +11,66 @@


class CategoricalScoreConfig(BaseModel):
"""Configuration for a categorical score."""
"""Configuration for a categorical score.

Attributes:
key (str): The feedback key for the evaluator.
choices (List[str]): List of valid categorical values that can be assigned.
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
[Review comment · Collaborator]
Say what None means (don't include a reasoning/CoT field).
Ditto with description (defaults to "The explanation for the score.").
I still think we should have a better default than "The explanation for the score.", like "Think step-by-step about what the correct score should be."

reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""

key: str
choices: List[str]
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
"""Configuration for a continuous score."""
"""Configuration for a continuous numerical score.

Attributes:
key (str): The feedback key for the evaluator.
min (float): The minimum allowed value for the score. Defaults to 0.
max (float): The maximum allowed value for the score. Defaults to 1.
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""

key: str
min: float = 0
max: float = 1
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None

[Review comment · Collaborator]
This is a breaking change, so we'll need to be able to handle accepting the other value in the init and log a deprecation warning.

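One possible shape for the backward-compatibility shim the comment above asks for. This is only a sketch, not code from this PR: the class name CategoricalScoreConfigCompat is illustrative, and it assumes pydantic v2 (under pydantic v1 the same mapping could live in a @root_validator(pre=True)). The same treatment would apply to ContinuousScoreConfig.

```python
import warnings
from typing import List, Optional

from pydantic import BaseModel, model_validator


class CategoricalScoreConfigCompat(BaseModel):
    """Categorical score config that still accepts the legacy kwargs."""

    key: str
    choices: List[str]
    description: str
    reasoning_key: Optional[str] = None
    reasoning_description: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def _shim_deprecated_fields(cls, data: dict) -> dict:
        # Translate include_explanation / explanation_description onto the new
        # reasoning_* fields and emit a DeprecationWarning instead of breaking callers.
        if isinstance(data, dict):
            if "include_explanation" in data:
                warnings.warn(
                    "'include_explanation' is deprecated; use 'reasoning_key' instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                if data.pop("include_explanation"):
                    data.setdefault("reasoning_key", "explanation")
            if "explanation_description" in data:
                warnings.warn(
                    "'explanation_description' is deprecated; use "
                    "'reasoning_description' instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                data.setdefault(
                    "reasoning_description", data.pop("explanation_description")
                )
        return data
```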

def _create_score_json_schema(
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
properties: Dict[str, Any] = {}

if score_config.reasoning_key:
properties[score_config.reasoning_key] = {
"type": "string",
"description": (
"The explanation for the score."
if score_config.reasoning_description is None
else score_config.reasoning_description
),
}

if isinstance(score_config, CategoricalScoreConfig):
properties["score"] = {
properties["value"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "

[Review comment · Collaborator]
It isn't a score anymore, is it? It's the selected category. The descriptions here matter a lot.

@@ -52,23 +87,24 @@
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

if score_config.include_explanation:
properties["explanation"] = {
"type": "string",
"description": (
"The explanation for the score."
if score_config.explanation_description is None
else score_config.explanation_description
),
}

return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
["score", "explanation"] if score_config.include_explanation else ["score"]
[
(
"value"
if isinstance(score_config, CategoricalScoreConfig)
else "score"
),
score_config.reasoning_key,
]
if score_config.reasoning_key
else [
"value" if isinstance(score_config, CategoricalScoreConfig) else "score"
]
),
}

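As a quick illustration of the renamed fields, here is a sketch (not part of the diff) of what the helper now produces for a categorical config; the expected values mirror the integration test further down.

```python
from langsmith.evaluation.llm_evaluator import (
    CategoricalScoreConfig,
    _create_score_json_schema,
)

config = CategoricalScoreConfig(
    key="vagueness",
    choices=["Y", "N"],
    description="Whether the response is vague. Y for yes, N for no.",
    reasoning_key="explanation",
)
schema = _create_score_json_schema(config)

# The categorical selection is now emitted under "value" rather than "score",
# and the reasoning property takes its name from reasoning_key.
assert schema["properties"]["value"]["enum"] == ["Y", "N"]
assert schema["required"] == ["value", "explanation"]
```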
@@ -194,7 +230,6 @@
"variables other than 'input', 'output', and 'expected'"
)
self.map_variables = map_variables

self.score_config = score_config
self.score_schema = _create_score_json_schema(self.score_config)

@@ -206,18 +241,27 @@
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Evaluate a run."""
source_run_id = uuid4()
variables = self._prepare_variables(run, example)
output: dict = cast(dict, self.runnable.invoke(variables))
return self._parse_output(output)
output: dict = cast(
dict,
self.runnable.invoke(variables, config={"run_id": source_run_id}),
)
return self._parse_output(output, str(source_run_id))

@warn_beta
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Asynchronously evaluate a run."""
source_run_id = uuid4()
variables = self._prepare_variables(run, example)
output: dict = cast(dict, await self.runnable.ainvoke(variables))
return self._parse_output(output)
output: dict = cast(
dict,
await self.runnable.ainvoke(variables, config={"run_id": source_run_id}),
)

return self._parse_output(output, str(source_run_id))

[Review comment · Collaborator]
nit: Could leave as a UUID.

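A minimal sketch of that nit (a method-body excerpt, not the PR's code); it assumes EvaluationResult accepts a uuid.UUID for source_run_id, which the suggestion implies.

```python
# In evaluate_run: keep the identifier as a UUID object end to end instead of
# round-tripping through str().
source_run_id = uuid4()
variables = self._prepare_variables(run, example)
output: dict = cast(
    dict,
    self.runnable.invoke(variables, config={"run_id": source_run_id}),
)
return self._parse_output(output, source_run_id)
```

The source_run_id parameter of _parse_output would then be annotated as uuid.UUID rather than str.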

def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
@@ -276,17 +320,25 @@

return variables

def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
def _parse_output(
self, output: dict, source_run_id: str
) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["score"]
explanation = output.get("explanation", None)
value = output["value"]
explanation = output.get(self.score_config.reasoning_key, None)
return EvaluationResult(
key=self.score_config.key, value=value, comment=explanation
key=self.score_config.key,
value=value,
comment=explanation,
source_run_id=source_run_id,
)
elif isinstance(self.score_config, ContinuousScoreConfig):
score = output["score"]
explanation = output.get("explanation", None)
explanation = output.get(self.score_config.reasoning_key, None)
return EvaluationResult(
key=self.score_config.key, score=score, comment=explanation
key=self.score_config.key,
score=score,
comment=explanation,
source_run_id=source_run_id,
)
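Taken together, a minimal usage sketch of the traced evaluator. This is illustrative only: the langchain_openai import, the prompt text, and the run / example objects are assumptions rather than part of this diff.

```python
from langchain_openai import ChatOpenAI  # assumed provider, as in the integration tests

from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig, LLMEvaluator

evaluator = LLMEvaluator.from_model(
    ChatOpenAI(),
    prompt_template="Is the response vague? Y or N.\n{input}\n{output}",
    score_config=CategoricalScoreConfig(
        key="vagueness",
        choices=["Y", "N"],
        description="Whether the response is vague. Y for yes, N for no.",
        reasoning_key="explanation",
    ),
)

# `run` and `example` are an existing Run / Example pair (not constructed here).
result = evaluator.evaluate_run(run, example)

# The judge's own LLM call is now traced under a known run id, and that id is
# attached to the returned feedback so the evaluation can be linked to its trace.
assert result.source_run_id is not None
```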
29 changes: 9 additions & 20 deletions python/tests/integration_tests/test_llm_evaluator.py
@@ -15,7 +15,7 @@ def test_llm_evaluator_init() -> None:
key="vagueness",
choices=["Y", "N"],
description="Whether the response is vague. Y for yes, N for no.",
include_explanation=True,
[Review comment · Collaborator]
Should keep a couple of these tests with include_explanation around for backward compat testing.

[Reply · Contributor (author)]
Added some backward compatibility tests.

reasoning_key="explanation",
),
)
assert evaluator is not None
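A hypothetical shape for one such backward-compatibility test. It presumes the legacy kwargs are shimmed onto the new fields as sketched earlier; the author's actual tests are not visible in this 8-commit view.

```python
import pytest  # assumed to already be available in the test suite


def test_categorical_score_config_accepts_legacy_kwargs() -> None:
    with pytest.warns(DeprecationWarning):
        config = CategoricalScoreConfig(
            key="vagueness",
            choices=["Y", "N"],
            description="Whether the response is vague. Y for yes, N for no.",
            include_explanation=True,
            explanation_description="The explanation for the score.",
        )
    # The legacy flags should map onto the new reasoning_* fields.
    assert config.reasoning_key == "explanation"
    assert config.reasoning_description == "The explanation for the score."
```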
@@ -25,7 +25,7 @@ def test_llm_evaluator_init() -> None:
"description": "Whether the response is vague. Y for yes, N for no.",
"type": "object",
"properties": {
"score": {
"value": {
"type": "string",
"enum": ["Y", "N"],
"description": "The score for the evaluation, one of Y, N.",
@@ -35,16 +35,14 @@ def test_llm_evaluator_init() -> None:
"description": "The explanation for the score.",
},
},
"required": ["score", "explanation"],
"required": ["value", "explanation"],
}

# Try a continuous score
evaluator = LLMEvaluator(
prompt_template="Rate the response from 0 to 1.\n{input}",
score_config=ContinuousScoreConfig(
key="rating",
description="The rating of the response, from 0 to 1.",
include_explanation=False,
key="rating", description="The rating of the response, from 0 to 1."
),
)

@@ -71,19 +69,15 @@ def test_llm_evaluator_init() -> None:
LLMEvaluator(
prompt_template="Rate the response from 0 to 1.\n{input}",
score_config=ContinuousScoreConfig(
key="rating",
description="The rating of the response, from 0 to 1.",
include_explanation=False,
key="rating", description="The rating of the response, from 0 to 1."
),
model_provider="invalid",
)

evaluator = LLMEvaluator(
prompt_template="Rate the response from 0 to 1.\n{input} {output} {expected}",
score_config=ContinuousScoreConfig(
key="rating",
description="The rating of the response, from 0 to 1.",
include_explanation=False,
key="rating", description="The rating of the response, from 0 to 1."
),
)
assert evaluator is not None
@@ -103,9 +97,7 @@ def test_llm_evaluator_init() -> None:
evaluator = LLMEvaluator(
prompt_template="Rate the response from 0 to 1.\n{input} {output} {hello}",
score_config=ContinuousScoreConfig(
key="rating",
description="The rating of the response, from 0 to 1.",
include_explanation=False,
key="rating", description="The rating of the response, from 0 to 1."
),
map_variables=lambda run, example: {"hello": "world"},
)
@@ -120,9 +112,7 @@ def test_from_model() -> None:
ChatOpenAI(),
prompt_template="Rate the response from 0 to 1.\n{input}",
score_config=ContinuousScoreConfig(
key="rating",
description="The rating of the response, from 0 to 1.",
include_explanation=False,
key="rating", description="The rating of the response, from 0 to 1."
),
)
assert evaluator is not None
@@ -165,7 +155,6 @@ async def apredict(inputs: dict) -> dict:
choices=["Y", "N"],
description="Whether the output is accurate with respect to "
"the expected output.",
include_explanation=False,
),
)

@@ -183,7 +172,7 @@ async def apredict(inputs: dict) -> dict:
choices=["Y", "N"],
description="Whether the output is accurate with respect to "
"the context and question.",
include_explanation=True,
reasoning_key="explanation",
),
map_variables=lambda run, example: {
"context": example.inputs.get("context", "") if example else "",