From 7e901add39412b480c037eb17b06f1450b1948fd Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 8 Nov 2024 17:38:13 -0800 Subject: [PATCH 1/5] rfc: accept simple evaluators --- python/langsmith/evaluation/_runner.py | 1 + python/langsmith/evaluation/evaluator.py | 44 +++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 111986b76..de910bd58 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -86,6 +86,7 @@ [schemas.Run, Optional[schemas.Example]], Union[EvaluationResult, EvaluationResults], ], + Callable[..., Union[dict, EvaluationResults, EvaluationResult]], ] AEVALUATOR_T = Union[ Callable[ diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 065f5b16b..9d398f1c4 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -17,7 +17,7 @@ cast, ) -from typing_extensions import TypedDict +from typing_extensions import TypedDict, get_type_hints try: from pydantic.v1 import ( # type: ignore[import] @@ -194,6 +194,8 @@ def __init__( func (Callable): A function that takes a `Run` and an optional `Example` as arguments, and returns a dict or `ComparisonEvaluationResult`. """ + func = _normalize_evaluator_func(func) + wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -632,3 +634,43 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) + + +def _normalize_evaluator_func( + func: Callable, +) -> Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT]: + # for backwards compatibility, if args are untyped we assume they correspond to + # Run and Example: + if not (type_hints := get_type_hints(func)): + return func + elif {Run, Example, Optional[Example]}.intersection(type_hints.values()): + return func + else: + sig = inspect.signature(func) + num_positional = len( + [ + p + for p in sig.parameters.values() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + ) + has_positional_var = any( + p.kind == p.VAR_POSITIONAL for p in sig.parameters.values() + ) + if not ( + num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) + ): + msg = "" + raise ValueError(msg) + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs, example.outputs) + if has_positional_var: + return func(*args) + else: + return func(*args[:num_positional]) + + wrapper.__name__ = ( + getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ + ) + return wrapper From 9b26f6b72c8808d230e0f53a2385520b33ea79ed Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:08:36 -0800 Subject: [PATCH 2/5] Update python/langsmith/evaluation/evaluator.py Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/evaluation/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 9d398f1c4..351e6acf7 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -664,7 +664,7 @@ def _normalize_evaluator_func( raise ValueError(msg) def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs, 
example.outputs) + args = (example.inputs, run.outputs or {}, example.outputs or {}) if has_positional_var: return func(*args) else: From b3b841f2a36d1f6bcac61e7b8f12144d11407fd0 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 11 Nov 2024 17:52:46 -0800 Subject: [PATCH 3/5] fmt --- python/langsmith/evaluation/evaluator.py | 53 ++++++++++++++----- .../unit_tests/evaluation/test_runner.py | 47 +++++++++++++--- 2 files changed, 80 insertions(+), 20 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 351e6acf7..082d3ffe7 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -195,6 +195,8 @@ def __init__( arguments, and returns a dict or `ComparisonEvaluationResult`. """ func = _normalize_evaluator_func(func) + if afunc: + afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment] wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -638,7 +640,10 @@ def comparison_evaluator( def _normalize_evaluator_func( func: Callable, -) -> Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT]: +) -> Union[ + Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], + Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], +]: # for backwards compatibility, if args are untyped we assume they correspond to # Run and Example: if not (type_hints := get_type_hints(func)): @@ -660,17 +665,41 @@ def _normalize_evaluator_func( if not ( num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) ): - msg = "" + msg = ( + "Invalid evaluator function. Expected to take either 2 or 3 positional " + "arguments. Please see " + "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 + ) raise ValueError(msg) - def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return func(*args) - else: - return func(*args[:num_positional]) + if inspect.iscoroutinefunction(func): - wrapper.__name__ = ( - getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ - ) - return wrapper + async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs or {}, example.outputs or {}) + if has_positional_var: + return await func(*args) + else: + return await func(*args[:num_positional]) + + awrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else awrapper.__name__ + ) + return awrapper # type: ignore[return-value] + + else: + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs or {}, example.outputs or {}) + if has_positional_var: + return func(*args) + else: + return func(*args[:num_positional]) + + wrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index d20960d3e..744ffcfcc 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -184,11 +184,26 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def 
score_unpacked_inputs_outputs_reference( + inputs: dict, outputs: dict, reference_outputs: dict + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + ] results = evaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -219,14 +234,14 @@ def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1) def score_value(run, example): return {"score": 0.7} @@ -347,11 +362,27 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + async def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_inputs_outputs_reference( + inputs: dict, outputs: dict, reference_outputs: dict + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + ] + results = await aevaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -387,14 +418,14 @@ async def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS assert slow_index is not None # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) assert fake_request.created_session["name"] From 03506c5765a84c09008af03004c492cd14ea9e13 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 11:16:36 -0800 Subject: [PATCH 4/5] cr --- python/langsmith/evaluation/evaluator.py | 78 +++++++++++++----------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 082d3ffe7..6571451b3 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -17,7 +17,7 @@ cast, ) -from typing_extensions import TypedDict, get_type_hints +from typing_extensions import TypedDict try: from pydantic.v1 import ( # type: 
ignore[import] @@ -644,42 +644,42 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], ]: - # for backwards compatibility, if args are untyped we assume they correspond to - # Run and Example: - if not (type_hints := get_type_hints(func)): - return func - elif {Run, Example, Optional[Example]}.intersection(type_hints.values()): + supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_no_default = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + and p.default is p.empty + ] + if ( + not all(pname in supported_args for pname in positional_no_default) + and len(positional_no_default) != 2 + ): + msg = ( + f"Invalid evaluator function argument names. Supported positional args are " + f"{supported_args}. Please see " + "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + # noqa: E501 + ) + raise ValueError(msg) + elif not all(pname in supported_args for pname in positional_no_default): + # For backwards compatibility we assume custom arg names are Run and Example + # types, respectively. return func else: - sig = inspect.signature(func) - num_positional = len( - [ - p - for p in sig.parameters.values() - if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) - ] - ) - has_positional_var = any( - p.kind == p.VAR_POSITIONAL for p in sig.parameters.values() - ) - if not ( - num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) - ): - msg = ( - "Invalid evaluator function. Expected to take either 2 or 3 positional " - "arguments. Please see " - "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 - ) - raise ValueError(msg) - if inspect.iscoroutinefunction(func): async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return await func(*args) - else: - return await func(*args[:num_positional]) + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_no_default) + return await func(*args) awrapper.__name__ = ( getattr(func, "__name__") @@ -691,11 +691,15 @@ async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: else: def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return func(*args) - else: - return func(*args[:num_positional]) + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_no_default) + return func(*args) wrapper.__name__ = ( getattr(func, "__name__") From 448bbf396f5a74e5d8349679650f8550b62a3093 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 13:02:33 -0800 Subject: [PATCH 5/5] fmt --- python/langsmith/evaluation/evaluator.py | 8 +- .../unit_tests/evaluation/test_runner.py | 94 ++++++++++++++----- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 01469878a..e00a3a260 
100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -665,14 +665,14 @@ def _normalize_evaluator_func( if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) and p.default is p.empty ] - if ( + if not positional_no_default or ( not all(pname in supported_args for pname in positional_no_default) and len(positional_no_default) != 2 ): msg = ( - f"Invalid evaluator function argument names. Supported positional args are " - f"{supported_args}. Please see " - "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}. Please " + f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 ) raise ValueError(msg) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 68467945e..fdae50474 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -21,6 +21,7 @@ from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation.evaluator import _normalize_evaluator_func class FakeRequest: @@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8): raise TimeoutError("Condition not met") +def _create_example(idx: int) -> ls_schemas.Example: + return ls_schemas.Example( + id=uuid.uuid4(), + inputs={"in": idx}, + outputs={"answer": idx + 1}, + dataset_id="00886375-eb2a-4038-9032-efff60309896", + created_at=datetime.now(timezone.utc), + ) + + @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) @@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -196,13 +198,11 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} - def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + def score_unpacked_inputs_outputs(inputs, outputs): ordering_of_stuff.append("evaluate") return {"score": outputs["output"]} - def score_unpacked_inputs_outputs_reference( - inputs: dict, outputs: dict, reference_outputs: dict - ): + def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} @@ -347,15 +347,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = 
[_create_example(i) for i in range(10)] @@ -416,12 +407,12 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} - async def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + async def score_unpacked_inputs_outputs(inputs, outputs): ordering_of_stuff.append("evaluate") return {"score": outputs["output"]} async def score_unpacked_inputs_outputs_reference( - inputs: dict, outputs: dict, reference_outputs: dict + inputs, outputs, reference_outputs ): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} @@ -531,3 +522,58 @@ async def bad_eval_list(run, example): ) async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + + +@pytest.mark.parametrize("sync", [True, False]) +async def test_evaluate_invalid_evaluator_signature(sync: bool) -> None: + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + session = mock.Mock() + tenant_id = str(uuid.uuid4()) + ds_examples = [_create_example(0)] + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client( + api_url="http://localhost:1984", + api_key="123", + session=session, + info=ls_schemas.LangSmithInfo( + batch_ingest_config=ls_schemas.BatchIngestConfig( + size_limit_bytes=None, # Note this field is not used here + size_limit=100, + scale_up_nthreads_limit=16, + scale_up_qsize_trigger=1000, + scale_down_nempty_trigger=4, + ) + ), + ) + client._tenant_id = tenant_id # type: ignore + + # args need to be positional + async def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) + async def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + async def atarget(x): + return x + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + if sync: + evaluate( + (lambda x: x), data=ds_examples, evaluators=[eval_], client=client + ) + else: + await aevaluate( + atarget, data=ds_examples, evaluators=[eval_], client=client + )
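
Note: below is a minimal, self-contained usage sketch of the simple-evaluator signatures this patch series enables; it is an editorial illustration, not part of the diffs above. The target function and dataset name are hypothetical placeholders; only the evaluate() entry point and the supported positional argument names (run, example, inputs, outputs, reference_outputs) come from the patches themselves.

    from langsmith import Client
    from langsmith.evaluation import evaluate

    client = Client()  # assumes LANGSMITH_API_KEY is configured in the environment

    def target(inputs: dict) -> dict:
        # Hypothetical system under test; replace with a real chain or model call.
        return {"output": inputs["question"].upper()}

    # New-style evaluator: positional args are drawn by name from
    # ("run", "example", "inputs", "outputs", "reference_outputs") and are
    # unpacked for you by _normalize_evaluator_func.
    def exact_match(inputs, outputs, reference_outputs):
        return {"key": "exact_match", "score": outputs == reference_outputs}

    # Old-style evaluator with custom arg names still works: exactly two
    # positional args are treated as a Run and an Example for backwards
    # compatibility.
    def has_output(run, example):
        return {"key": "has_output", "score": bool(run.outputs)}

    results = evaluate(
        target,
        data="my-dataset",  # hypothetical dataset name
        evaluators=[exact_match, has_output],
        client=client,
    )

As implemented in the final patch, an evaluator whose required positional arguments all come from the supported set is wrapped so it receives the example inputs, the run outputs (defaulting to {}), and the reference outputs (defaulting to {}); a function with exactly two custom-named positional arguments keeps the legacy (run, example) behavior, and anything else raises the "Invalid evaluator function" ValueError exercised in the new tests.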