From 7e901add39412b480c037eb17b06f1450b1948fd Mon Sep 17 00:00:00 2001 From: Bagatur Date: Fri, 8 Nov 2024 17:38:13 -0800 Subject: [PATCH 1/5] rfc: accept simple evaluators --- python/langsmith/evaluation/_runner.py | 1 + python/langsmith/evaluation/evaluator.py | 44 +++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 111986b76..de910bd58 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -86,6 +86,7 @@ [schemas.Run, Optional[schemas.Example]], Union[EvaluationResult, EvaluationResults], ], + Callable[..., Union[dict, EvaluationResults, EvaluationResult]], ] AEVALUATOR_T = Union[ Callable[ diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 065f5b16b..9d398f1c4 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -17,7 +17,7 @@ cast, ) -from typing_extensions import TypedDict +from typing_extensions import TypedDict, get_type_hints try: from pydantic.v1 import ( # type: ignore[import] @@ -194,6 +194,8 @@ def __init__( func (Callable): A function that takes a `Run` and an optional `Example` as arguments, and returns a dict or `ComparisonEvaluationResult`. """ + func = _normalize_evaluator_func(func) + wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -632,3 +634,43 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) + + +def _normalize_evaluator_func( + func: Callable, +) -> Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT]: + # for backwards compatibility, if args are untyped we assume they correspond to + # Run and Example: + if not (type_hints := get_type_hints(func)): + return func + elif {Run, Example, Optional[Example]}.intersection(type_hints.values()): + return func + else: + sig = inspect.signature(func) + num_positional = len( + [ + p + for p in sig.parameters.values() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + ) + has_positional_var = any( + p.kind == p.VAR_POSITIONAL for p in sig.parameters.values() + ) + if not ( + num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) + ): + msg = "" + raise ValueError(msg) + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs, example.outputs) + if has_positional_var: + return func(*args) + else: + return func(*args[:num_positional]) + + wrapper.__name__ = ( + getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ + ) + return wrapper From 9b26f6b72c8808d230e0f53a2385520b33ea79ed Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:08:36 -0800 Subject: [PATCH 2/5] Update python/langsmith/evaluation/evaluator.py Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/evaluation/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 9d398f1c4..351e6acf7 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -664,7 +664,7 @@ def _normalize_evaluator_func( raise ValueError(msg) def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs, 
example.outputs) + args = (example.inputs, run.outputs or {}, example.outputs or {}) if has_positional_var: return func(*args) else: From b3b841f2a36d1f6bcac61e7b8f12144d11407fd0 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 11 Nov 2024 17:52:46 -0800 Subject: [PATCH 3/5] fmt --- python/langsmith/evaluation/evaluator.py | 53 ++++++++++++++----- .../unit_tests/evaluation/test_runner.py | 47 +++++++++++++--- 2 files changed, 80 insertions(+), 20 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 351e6acf7..082d3ffe7 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -195,6 +195,8 @@ def __init__( arguments, and returns a dict or `ComparisonEvaluationResult`. """ func = _normalize_evaluator_func(func) + if afunc: + afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment] wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -638,7 +640,10 @@ def comparison_evaluator( def _normalize_evaluator_func( func: Callable, -) -> Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT]: +) -> Union[ + Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], + Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], +]: # for backwards compatibility, if args are untyped we assume they correspond to # Run and Example: if not (type_hints := get_type_hints(func)): @@ -660,17 +665,41 @@ def _normalize_evaluator_func( if not ( num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) ): - msg = "" + msg = ( + "Invalid evaluator function. Expected to take either 2 or 3 positional " + "arguments. Please see " + "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 + ) raise ValueError(msg) - def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return func(*args) - else: - return func(*args[:num_positional]) + if inspect.iscoroutinefunction(func): - wrapper.__name__ = ( - getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__ - ) - return wrapper + async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs or {}, example.outputs or {}) + if has_positional_var: + return await func(*args) + else: + return await func(*args[:num_positional]) + + awrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else awrapper.__name__ + ) + return awrapper # type: ignore[return-value] + + else: + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + args = (example.inputs, run.outputs or {}, example.outputs or {}) + if has_positional_var: + return func(*args) + else: + return func(*args[:num_positional]) + + wrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index d20960d3e..744ffcfcc 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -184,11 +184,26 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def 
score_unpacked_inputs_outputs_reference( + inputs: dict, outputs: dict, reference_outputs: dict + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + ] results = evaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -219,14 +234,14 @@ def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1) def score_value(run, example): return {"score": 0.7} @@ -347,11 +362,27 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + async def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_inputs_outputs_reference( + inputs: dict, outputs: dict, reference_outputs: dict + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + ] + results = await aevaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -387,14 +418,14 @@ async def score_value_first(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) _wait_until(lambda: slow_index is not None) # Want it to be interleaved - assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS + assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS assert slow_index is not None # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS * 2) - 2 + assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) assert fake_request.created_session["name"] From 03506c5765a84c09008af03004c492cd14ea9e13 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 11:16:36 -0800 Subject: [PATCH 4/5] cr --- python/langsmith/evaluation/evaluator.py | 78 +++++++++++++----------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 082d3ffe7..6571451b3 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -17,7 +17,7 @@ cast, ) -from typing_extensions import TypedDict, get_type_hints +from typing_extensions import TypedDict try: from pydantic.v1 import ( # type: 
ignore[import] @@ -644,42 +644,42 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], ]: - # for backwards compatibility, if args are untyped we assume they correspond to - # Run and Example: - if not (type_hints := get_type_hints(func)): - return func - elif {Run, Example, Optional[Example]}.intersection(type_hints.values()): + supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_no_default = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + and p.default is p.empty + ] + if ( + not all(pname in supported_args for pname in positional_no_default) + and len(positional_no_default) != 2 + ): + msg = ( + f"Invalid evaluator function argument names. Supported positional args are " + f"{supported_args}. Please see " + "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + # noqa: E501 + ) + raise ValueError(msg) + elif not all(pname in supported_args for pname in positional_no_default): + # For backwards compatibility we assume custom arg names are Run and Example + # types, respectively. return func else: - sig = inspect.signature(func) - num_positional = len( - [ - p - for p in sig.parameters.values() - if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) - ] - ) - has_positional_var = any( - p.kind == p.VAR_POSITIONAL for p in sig.parameters.values() - ) - if not ( - num_positional in (2, 3) or (num_positional <= 3 and has_positional_var) - ): - msg = ( - "Invalid evaluator function. Expected to take either 2 or 3 positional " - "arguments. Please see " - "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 - ) - raise ValueError(msg) - if inspect.iscoroutinefunction(func): async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return await func(*args) - else: - return await func(*args[:num_positional]) + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_no_default) + return await func(*args) awrapper.__name__ = ( getattr(func, "__name__") @@ -691,11 +691,15 @@ async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: else: def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: - args = (example.inputs, run.outputs or {}, example.outputs or {}) - if has_positional_var: - return func(*args) - else: - return func(*args[:num_positional]) + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_no_default) + return func(*args) wrapper.__name__ = ( getattr(func, "__name__") From 448bbf396f5a74e5d8349679650f8550b62a3093 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 14 Nov 2024 13:02:33 -0800 Subject: [PATCH 5/5] fmt --- python/langsmith/evaluation/evaluator.py | 8 +- .../unit_tests/evaluation/test_runner.py | 94 ++++++++++++++----- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 01469878a..e00a3a260 
100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -665,14 +665,14 @@ def _normalize_evaluator_func( if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) and p.default is p.empty ] - if ( + if not positional_no_default or ( not all(pname in supported_args for pname in positional_no_default) and len(positional_no_default) != 2 ): msg = ( - f"Invalid evaluator function argument names. Supported positional args are " - f"{supported_args}. Please see " - "https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}. Please " + f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" # noqa: E501 ) raise ValueError(msg) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 68467945e..fdae50474 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -21,6 +21,7 @@ from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation.evaluator import _normalize_evaluator_func class FakeRequest: @@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8): raise TimeoutError("Condition not met") +def _create_example(idx: int) -> ls_schemas.Example: + return ls_schemas.Example( + id=uuid.uuid4(), + inputs={"in": idx}, + outputs={"answer": idx + 1}, + dataset_id="00886375-eb2a-4038-9032-efff60309896", + created_at=datetime.now(timezone.utc), + ) + + @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) @@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -196,13 +198,11 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} - def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + def score_unpacked_inputs_outputs(inputs, outputs): ordering_of_stuff.append("evaluate") return {"score": outputs["output"]} - def score_unpacked_inputs_outputs_reference( - inputs: dict, outputs: dict, reference_outputs: dict - ): + def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} @@ -347,15 +347,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = 
[_create_example(i) for i in range(10)] @@ -416,12 +407,12 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} - async def score_unpacked_inputs_outputs(inputs: dict, outputs: dict): + async def score_unpacked_inputs_outputs(inputs, outputs): ordering_of_stuff.append("evaluate") return {"score": outputs["output"]} async def score_unpacked_inputs_outputs_reference( - inputs: dict, outputs: dict, reference_outputs: dict + inputs, outputs, reference_outputs ): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} @@ -531,3 +522,58 @@ async def bad_eval_list(run, example): ) async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + + +@pytest.mark.parametrize("sync", [True, False]) +async def test_evaluate_invalid_evaluator_signature(sync: bool) -> None: + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + session = mock.Mock() + tenant_id = str(uuid.uuid4()) + ds_examples = [_create_example(0)] + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client( + api_url="http://localhost:1984", + api_key="123", + session=session, + info=ls_schemas.LangSmithInfo( + batch_ingest_config=ls_schemas.BatchIngestConfig( + size_limit_bytes=None, # Note this field is not used here + size_limit=100, + scale_up_nthreads_limit=16, + scale_up_qsize_trigger=1000, + scale_down_nempty_trigger=4, + ) + ), + ) + client._tenant_id = tenant_id # type: ignore + + # args need to be positional + async def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) + async def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + async def atarget(x): + return x + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + if sync: + evaluate( + (lambda x: x), data=ds_examples, evaluators=[eval_], client=client + ) + else: + await aevaluate( + atarget, data=ds_examples, evaluators=[eval_], client=client + )
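
Note: below is a minimal, self-contained usage sketch of the simple-evaluator signatures this patch series enables; it is an editorial illustration, not part of the diffs above. The target function and dataset name are hypothetical placeholders; only the evaluate() entry point and the supported positional argument names (run, example, inputs, outputs, reference_outputs) come from the patches themselves.

    from langsmith import Client
    from langsmith.evaluation import evaluate

    client = Client()  # assumes LANGSMITH_API_KEY is configured in the environment

    def target(inputs: dict) -> dict:
        # Hypothetical system under test; replace with a real chain or model call.
        return {"output": inputs["question"].upper()}

    # New-style evaluator: positional args are drawn by name from
    # ("run", "example", "inputs", "outputs", "reference_outputs") and are
    # unpacked for you by _normalize_evaluator_func.
    def exact_match(inputs, outputs, reference_outputs):
        return {"key": "exact_match", "score": outputs == reference_outputs}

    # Old-style evaluator with custom arg names still works: exactly two
    # positional args are treated as a Run and an Example for backwards
    # compatibility.
    def has_output(run, example):
        return {"key": "has_output", "score": bool(run.outputs)}

    results = evaluate(
        target,
        data="my-dataset",  # hypothetical dataset name
        evaluators=[exact_match, has_output],
        client=client,
    )

As implemented in the final patch, an evaluator whose required positional arguments all come from the supported set is wrapped so it receives the example inputs, the run outputs (defaulting to {}), and the reference outputs (defaulting to {}); a function with exactly two custom-named positional arguments keeps the legacy (run, example) behavior, and anything else raises the "Invalid evaluator function" ValueError exercised in the new tests.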