From c841ec6528513a3bf124c2061940a05975abcb53 Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:29:22 -0800 Subject: [PATCH] add attachments to evaluate (#1237) --- python/langsmith/evaluation/_arunner.py | 5 +- python/langsmith/evaluation/_runner.py | 28 ++- python/langsmith/evaluation/evaluator.py | 11 +- python/tests/integration_tests/test_client.py | 227 +++++++++++++++++- .../unit_tests/evaluation/test_runner.py | 154 ++++++++++++ 5 files changed, 408 insertions(+), 17 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 59fe06caf..7cee6bcf5 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -37,6 +37,7 @@ DATA_T, EVALUATOR_T, ExperimentResultRow, + _evaluators_include_attachments, _ExperimentManagerMixin, _extract_feedback_keys, _ForwardResults, @@ -259,6 +260,7 @@ async def aevaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + .. versionchanged:: 0.2.0 'max_concurrency' default updated from None (no limit on concurrency) @@ -476,7 +478,8 @@ async def _aevaluate( description=description, num_repetitions=num_repetitions, runs=runs, - include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).astart() cache_dir = ls_utils.get_cache_dir(None) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 836aaabe4..ddbd9bf18 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1064,7 +1064,8 @@ def _evaluate( # If provided, we don't need to create a new experiment. runs=runs, # Create or resolve the experiment. 
- include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).start() cache_dir = ls_utils.get_cache_dir(None) @@ -1913,7 +1914,30 @@ def _ensure_traceable( return fn -def _include_attachments(target: Any) -> bool: +def _evaluators_include_attachments( + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], +) -> bool: + if evaluators is None: + return False + return any( + any( + p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.name == "attachments" + for p in ( + inspect.signature( + e.__call__ if hasattr(e, "__call__") else e + ).parameters.values() + if callable(e) or hasattr(e, "__call__") + else [] + ) + ) + for e in evaluators + ) + + +def _include_attachments( + target: Any, +) -> bool: """Whether the target function accepts attachments.""" if _is_langchain_runnable(target) or not callable(target): return False diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 02fab3b71..a1505699a 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -624,7 +624,14 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], ]: - supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + supported_args = ( + "run", + "example", + "inputs", + "outputs", + "reference_outputs", + "attachments", + ) sig = inspect.signature(func) positional_args = [ pname @@ -659,6 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, + "attachments": example.attachments or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -679,6 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, + "attachments": example.attachments or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 33eec0f46..f72a2ebdf 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -20,7 +20,7 @@ from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor from langsmith.client import ID_TYPE, Client -from langsmith.evaluation import evaluate +from langsmith.evaluation import aevaluate, evaluate from langsmith.schemas import ( AttachmentsOperations, DataType, @@ -1215,9 +1215,6 @@ def create_encoder(*args, **kwargs): assert not caplog.records -@pytest.mark.skip( - reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first" -) def test_list_examples_attachments_keys(langchain_client: Client) -> None: """Test list_examples returns same keys with and without attachments.""" dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4] @@ -1256,6 +1253,7 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + # 1. 
Create dataset dataset = langchain_client.create_dataset( dataset_name, @@ -1274,37 +1272,89 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) - # 3. Define target function that uses attachments def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments + assert "presigned_url" in attachments["image"] image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} - # 4. Define simple evaluator - def evaluator(run: Run, example: Example) -> Dict[str, Any]: + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" return { "score": float( - run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + reference_outputs.get("answer") == outputs.get("answer") # type: ignore ) } - # 5. Run evaluation - results = evaluate( + results = langchain_client.evaluate( + target, + data=dataset_name, + evaluators=[evaluator], + num_repetitions=2, + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(uploads=[example]) + + def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = langchain_client.evaluate( target, data=dataset_name, evaluators=[evaluator], - client=langchain_client, num_repetitions=2, ) - # 6. 
Verify results assert len(results) == 2 for result in results: assert result["evaluation_results"]["results"][0].score == 1.0 - # Cleanup langchain_client.delete_dataset(dataset_name=dataset_name) @@ -1355,6 +1405,157 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]: langchain_client.delete_dataset(dataset_name=dataset_name) +async def test_aevaluate_with_attachments(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive the attachment data + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments_not_in_target( + langchain_client: Client, +) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + 
langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: + """Test evaluating examples without attachments using a target with attachments.""" + dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals without attachments", + data_type=DataType.kv, + ) + + # Create example using old way, attachments should be set to {} + langchain_client.create_example( + dataset_id=dataset.id, + inputs={"question": "What is 2+2?"}, + outputs={"answer": "4"}, + ) + + # Verify we can create example the new way without attachments + example = ExampleUploadWithAttachments( + inputs={"question": "What is 3+1?"}, + outputs={"answer": "4"}, + ) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive an empty attachments dict + assert isinstance(attachments, dict) + assert len(attachments) == 0 + return {"answer": "4"} + + async def evaluator(run: Run, example: Example) -> Dict[str, Any]: + return { + "score": float( + run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + ) + } + + results = await aevaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + def test_examples_length_validation(langchain_client: Client) -> None: """Test that mismatched lengths raise ValueError for create and update examples.""" dataset_name = "__test_examples_length_validation" + uuid4().hex[:4] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 04b269100..e33d07fd5 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -242,6 +242,14 @@ def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -270,6 +278,8 @@ def summary_eval_outputs_reference(outputs, reference_outputs): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -524,6 +534,14 @@ async def score_unpacked_inputs_outputs_reference( ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + async def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -552,6 +570,8 @@ def summary_eval_outputs_reference(outputs, 
reference_outputs): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -803,6 +823,140 @@ def test_include_attachments(target, expected, error_msg, is_async): assert result == expected +def valid_single_supported(inputs, *, optional=None): + return {"score": 1} + + +async def valid_single_supported_async(inputs, *, optional=None): + return {"score": 1} + + +def valid_two_arbitrary(foo, bar, *, optional=None): + return {"score": 1} + + +async def valid_two_arbitrary_async(foo, bar, *, optional=None): + return {"score": 1} + + +def valid_multiple_supported(inputs, outputs, reference_outputs, *, optional=None): + return {"score": 1} + + +async def valid_multiple_supported_async( + inputs, outputs, reference_outputs, *, optional=None +): + return {"score": 1} + + +def invalid_single_unsupported(foo, *, optional=None): + return {"score": 1} + + +async def invalid_single_unsupported_async(foo, *, optional=None): + return {"score": 1} + + +def invalid_three_args(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +async def invalid_three_args_async(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +def invalid_no_positional(*, inputs, outputs, optional=None): + return {"score": 1} + + +async def invalid_no_positional_async(*, inputs, outputs, optional=None): + return {"score": 1} + + +# Test cases that should succeed +VALID_EVALUATOR_CASES = [ + (valid_single_supported, False), + (valid_single_supported_async, True), + (valid_two_arbitrary, False), + (valid_two_arbitrary_async, True), + (valid_multiple_supported, False), + (valid_multiple_supported_async, True), +] + +# Test cases that should raise ValueError +INVALID_EVALUATOR_CASES = [ + (invalid_single_unsupported, False), + (invalid_single_unsupported_async, True), + (invalid_three_args, False), + (invalid_three_args_async, True), + (invalid_no_positional, False), + (invalid_no_positional_async, True), +] + + +def target(inputs, attachments): + return {"foo": "bar"} + + +async def atarget(inputs, attachments): + return {"foo": "bar"} + + +@pytest.mark.parametrize("func,is_async", VALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_valid(func, is_async): + """Test _normalize_evaluator_func succeeds.""" + func = _normalize_evaluator_func(func) + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + +@pytest.mark.parametrize("func,is_async", INVALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_invalid(func, is_async): + """Test _normalize_evaluator_func fails correctly.""" + with pytest.raises(ValueError, match="Invalid evaluator function"): + _normalize_evaluator_func(func) + + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_example_responses = 
[_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + with pytest.raises(ValueError, match="Invalid evaluator function"): + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + def summary_eval_runs_examples(runs_, examples_): return {"score": len(runs_[0].dotted_order)}
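
For reference, a minimal usage sketch of the behavior this patch adds: evaluator functions may now declare an "attachments" argument and receive the example's attachments, just as target functions already could. This is not part of the diff above; it mirrors the integration tests in test_client.py, the dataset name and example data are illustrative, and the import locations are assumed to match the test file.

from typing import Any, Dict

from langsmith import Client
from langsmith.schemas import ExampleUploadWithAttachments

client = Client()

# Illustrative dataset, modeled on the fixtures in the integration tests.
dataset = client.create_dataset("attachments-usage-sketch")
client.upload_examples_multipart(
    dataset_id=dataset.id,
    uploads=[
        ExampleUploadWithAttachments(
            inputs={"question": "What is shown in the image?"},
            outputs={"answer": "test image"},
            attachments={"image": ("image/png", b"fake image data for testing")},
        )
    ],
)


def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
    # Each attachment exposes a binary "reader" (and a "presigned_url").
    attachments["image"]["reader"].read()
    return {"answer": "test image"}


def evaluator(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    # Evaluators can now accept "attachments" alongside the other supported
    # argument names (run, example, inputs, outputs, reference_outputs).
    return {"score": float(outputs.get("answer") == reference_outputs.get("answer"))}


results = client.evaluate(target, data=dataset.name, evaluators=[evaluator])
client.delete_dataset(dataset_id=dataset.id)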