From 9bd5969dd83f0568d71a6ff90d110e25d6fb723e Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Wed, 20 Nov 2024 10:53:20 -0800 Subject: [PATCH 1/9] wip --- python/langsmith/client.py | 14 +- python/langsmith/evaluation/evaluator.py | 11 +- python/tests/integration_tests/test_client.py | 132 ++++++++++-- .../unit_tests/evaluation/test_runner.py | 191 ++++++++++++++---- 4 files changed, 286 insertions(+), 62 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index f1213d939..1976259c1 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -122,6 +122,16 @@ class ZoneInfo: # type: ignore[no-redef] URLLIB3_SUPPORTS_BLOCKSIZE = "key_blocksize" in signature(PoolKey).parameters +class AutoSeekBytesIO(io.BytesIO): + """BytesIO class that resets on read.""" + + def read(self, *args, **kwargs): + """Reset on read.""" + data = super().read(*args, **kwargs) + self.seek(0) + return data + + def _parse_token_or_url( url_or_token: Union[str, uuid.UUID], api_url: str, @@ -3672,7 +3682,7 @@ def read_example( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = io.BytesIO(response.content) + reader = AutoSeekBytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, @@ -3759,7 +3769,7 @@ def list_examples( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = io.BytesIO(response.content) + reader = AutoSeekBytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index feb0e95e4..e2e898f2c 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -657,7 +657,14 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], ]: - supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + supported_args = ( + "run", + "example", + "inputs", + "outputs", + "reference_outputs", + "attachments", + ) sig = inspect.signature(func) positional_args = [ pname @@ -691,6 +698,7 @@ async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "inputs": example.inputs, "outputs": run.outputs or {}, "reference_outputs": example.outputs or {}, + "attachments": example.attachment_urls or {}, } args = (arg_map[arg] for arg in positional_args) return await func(*args) @@ -711,6 +719,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "inputs": example.inputs, "outputs": run.outputs or {}, "reference_outputs": example.outputs or {}, + "attachments": example.attachment_urls or {}, } args = (arg_map[arg] for arg in positional_args) return func(*args) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index cfd848e01..9b85fa480 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -20,7 +20,7 @@ from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor from langsmith.client import ID_TYPE, Client -from langsmith.evaluation import evaluate +from langsmith.evaluation import aevaluate, evaluate from langsmith.schemas import DataType, Example, ExampleUpsertWithAttachments, Run from langsmith.utils import ( 
LangSmithConnectionError, @@ -1122,9 +1122,6 @@ def create_encoder(*args, **kwargs): assert not caplog.records -@pytest.mark.skip( - reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first" -) def test_list_examples_attachments_keys(langchain_client: Client) -> None: """Test list_examples returns same keys with and without attachments.""" dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4] @@ -1160,20 +1157,18 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: langchain_client.delete_dataset(dataset_id=dataset.id) -@pytest.mark.skip( - reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first" -) def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - # 1. Create dataset + langchain_client = Client( + api_key="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", + ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", data_type=DataType.kv, ) - # 2. Create example with attachments example = ExampleUpsertWithAttachments( dataset_id=dataset.id, inputs={"question": "What is shown in the image?"}, @@ -1185,7 +1180,6 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: langchain_client.upsert_examples_multipart(upserts=[example]) - # 3. Define target function that uses attachments def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments @@ -1193,25 +1187,26 @@ def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} - # 4. Define simple evaluator - def evaluator(run: Run, example: Example) -> Dict[str, Any]: + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" return { "score": float( - run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + reference_outputs.get("answer") == outputs.get("answer") # type: ignore ) } - # 5. Run evaluation results = evaluate( target, data=dataset_name, evaluators=[evaluator], client=langchain_client ) - # 6. 
Verify results assert len(results) == 1 for result in results: assert result["evaluation_results"]["results"][0].score == 1.0 - # Cleanup langchain_client.delete_dataset(dataset_name=dataset_name) @@ -1261,3 +1256,108 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]: assert result["evaluation_results"]["results"][0].score == 1.0 langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + langchain_client = Client( + api_key="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", + ) + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive the attachment data + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await aevaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 1 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: + """Test evaluating examples without attachments using a target with attachments.""" + dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals without attachments", + data_type=DataType.kv, + ) + + # Create example using old way, attachments should be set to {} + langchain_client.create_example( + dataset_id=dataset.id, + inputs={"question": "What is 2+2?"}, + outputs={"answer": "4"}, + ) + + # Verify we can create example the new way without attachments + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is 3+1?"}, + outputs={"answer": "4"}, + ) + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive an empty attachments dict + assert isinstance(attachments, dict) + assert len(attachments) == 0 + return {"answer": "4"} + + async def evaluator(run: Run, example: Example) -> Dict[str, Any]: + return { + "score": float( + run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + ) + } + + results = await aevaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 2 + 
async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 6fb5a1739..413422523 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -208,6 +208,14 @@ def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -227,6 +235,8 @@ def eval_list(run, example): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -313,25 +323,6 @@ def bad_eval_list(run, example): for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators - # args need to be positional - def eval1(*, inputs, outputs): - pass - - # if more than 2 positional args, they must all have default arg names - # (run, example, ...) - def eval2(x, y, inputs): - pass - - evaluators = [eval1, eval2] - - for eval_ in evaluators: - with pytest.raises(ValueError, match="Invalid evaluator function."): - _normalize_evaluator_func(eval_) - - with pytest.raises(ValueError, match="Invalid evaluator function."): - evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client) - def test_evaluate_raises_for_async(): async def my_func(inputs: dict): @@ -439,6 +430,14 @@ async def score_unpacked_inputs_outputs_reference( ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + async def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -458,6 +457,8 @@ async def eval_list(run, example): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -545,30 +546,6 @@ async def bad_eval_list(run, example): async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators - # args need to be positional - async def eval1(*, inputs, outputs): - pass - - # if more than 2 positional args, they must all have default arg names - # (run, example, ...) 
- async def eval2(x, y, inputs): - pass - - evaluators = [eval1, eval2] - - async def atarget(x): - return x - - for eval_ in evaluators: - with pytest.raises(ValueError, match="Invalid evaluator function."): - _normalize_evaluator_func(eval_) - - with pytest.raises(ValueError, match="Invalid evaluator function."): - await aevaluate( - atarget, data=ds_examples, evaluators=[eval_], client=client - ) - @as_runnable def nested_predict(inputs): @@ -647,3 +624,131 @@ def test_include_attachments(target, expected, error_msg): else: result = _include_attachments(target) assert result == expected + + +def valid_single_supported(inputs, *, optional=None): + return {"score": 1} + + +async def valid_single_supported_async(inputs, *, optional=None): + return {"score": 1} + + +def valid_two_arbitrary(foo, bar, *, optional=None): + return {"score": 1} + + +async def valid_two_arbitrary_async(foo, bar, *, optional=None): + return {"score": 1} + + +def valid_multiple_supported(inputs, outputs, reference_outputs, *, optional=None): + return {"score": 1} + + +async def valid_multiple_supported_async( + inputs, outputs, reference_outputs, *, optional=None +): + return {"score": 1} + + +def invalid_single_unsupported(foo, *, optional=None): + return {"score": 1} + + +async def invalid_single_unsupported_async(foo, *, optional=None): + return {"score": 1} + + +def invalid_three_args(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +async def invalid_three_args_async(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +def invalid_no_positional(*, inputs, outputs, optional=None): + return {"score": 1} + + +async def invalid_no_positional_async(*, inputs, outputs, optional=None): + return {"score": 1} + + +# Test cases that should succeed +VALID_EVALUATOR_CASES = [ + (valid_single_supported, False), + (valid_single_supported_async, True), + (valid_two_arbitrary, False), + (valid_two_arbitrary_async, True), + (valid_multiple_supported, False), + (valid_multiple_supported_async, True), +] + +# Test cases that should raise ValueError +INVALID_EVALUATOR_CASES = [ + (invalid_single_unsupported, False), + (invalid_single_unsupported_async, True), + (invalid_three_args, False), + (invalid_three_args_async, True), + (invalid_no_positional, False), + (invalid_no_positional_async, True), +] + + +def target(inputs, attachments): + return {"foo": "bar"} + + +async def atarget(inputs, attachments): + return {"foo": "bar"} + + +@pytest.mark.parametrize("func,is_async", VALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_valid(func, is_async): + """Test _normalize_evaluator_func succeeds.""" + func = _normalize_evaluator_func(func) + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_examples = [_create_example(i) for i in range(10)] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + +@pytest.mark.parametrize("func,is_async", INVALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_invalid(func, is_async): + """Test _normalize_evaluator_func fails correctly.""" + with pytest.raises(ValueError, match="Invalid evaluator 
function"): + _normalize_evaluator_func(func) + + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_examples = [_create_example(i) for i in range(10)] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + with pytest.raises(ValueError, match="Invalid evaluator function"): + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) From a72a268ffd02d9044dd0f332f5b2c51daa126aef Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Wed, 20 Nov 2024 11:05:06 -0800 Subject: [PATCH 2/9] rip keys --- python/tests/integration_tests/test_client.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 9b85fa480..03e816e0a 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1160,9 +1160,6 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - langchain_client = Client( - api_key="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", - ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", @@ -1261,9 +1258,6 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]: async def test_aevaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] - langchain_client = Client( - api_key="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", - ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", From 7307836aa7a01ba75dc5480fad213193f1830d57 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Fri, 6 Dec 2024 15:52:59 -0800 Subject: [PATCH 3/9] changes --- python/langsmith/client.py | 14 +-- python/langsmith/evaluation/_arunner.py | 16 ++- python/langsmith/evaluation/_runner.py | 14 ++- python/tests/integration_tests/test_client.py | 115 ++++++++++++++++-- .../unit_tests/evaluation/test_runner.py | 3 +- 5 files changed, 137 insertions(+), 25 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index f052681cd..4e32188e6 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -140,16 +140,6 @@ class ZoneInfo: # type: ignore[no-redef] URLLIB3_SUPPORTS_BLOCKSIZE = "key_blocksize" in signature(PoolKey).parameters -class AutoSeekBytesIO(io.BytesIO): - """BytesIO class that resets on read.""" - - def read(self, *args, **kwargs): - """Reset on read.""" - data = super().read(*args, **kwargs) - self.seek(0) - return data - - def _parse_token_or_url( url_or_token: Union[str, uuid.UUID], api_url: str, @@ -3835,7 +3825,7 @@ def read_example( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = AutoSeekBytesIO(response.content) + reader = io.BytesIO(response.content) 
attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, @@ -3922,7 +3912,7 @@ def list_examples( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = AutoSeekBytesIO(response.content) + reader = io.BytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 9412bf5f3..132ab4fcb 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -476,7 +476,8 @@ async def _aevaluate( description=description, num_repetitions=num_repetitions, runs=runs, - include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).astart() cache_dir = ls_utils.get_cache_dir(None) @@ -1054,6 +1055,19 @@ def _get_run(r: run_trees.RunTree) -> None: ) +def _evaluators_include_attachments( + evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]], +) -> bool: + return any( + any( + p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.name == "attachments" + for p in inspect.signature(e.__call__).parameters.values() + ) + for e in evaluators + ) + + def _include_attachments( target: Union[ATARGET_T, Iterable[schemas.Run], AsyncIterable[dict], Runnable], ) -> bool: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ebd259e14..ff0d3def3 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1064,7 +1064,8 @@ def _evaluate( # If provided, we don't need to create a new experiment. runs=runs, # Create or resolve the experiment. 
- include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).start() cache_dir = ls_utils.get_cache_dir(None) @@ -1913,6 +1914,17 @@ def _ensure_traceable( return fn +def _evaluators_include_attachments(evaluators: Sequence[EVALUATOR_T]) -> bool: + return any( + any( + p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.name == "attachments" + for p in inspect.signature(e.__call__).parameters.values() + ) + for e in evaluators + ) + + def _include_attachments( target: Union[TARGET_T, Iterable[schemas.Run], Runnable], ) -> bool: diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 79d02e60f..ed136a0bb 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1265,14 +1265,13 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( + example = ExampleUploadWithAttachments( dataset_id=dataset.id, inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, @@ -1281,7 +1280,7 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(uploads=[example]) def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data @@ -1302,11 +1301,60 @@ def evaluator( ) } - results = evaluate( + results = langchain_client.evaluate( + target, + data=dataset_name, + evaluators=[evaluator], + num_repetitions=2, + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(uploads=[example]) + + def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = langchain_client.evaluate( target, data=dataset_name, evaluators=[evaluator], - client=langchain_client, num_repetitions=2, ) @@ -1406,11 +1454,60 @@ 
async def evaluator( ) } - results = await aevaluate( - target, data=dataset_name, evaluators=[evaluator], client=langchain_client + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments_not_in_target( + langchain_client: Client, +) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 ) - assert len(results) == 1 + assert len(results) == 2 async for result in results: assert result["evaluation_results"]["results"][0].score == 1.0 @@ -1465,7 +1562,7 @@ async def evaluator(run: Run, example: Example) -> Dict[str, Any]: assert result["evaluation_results"]["results"][0].score == 1.0 langchain_client.delete_dataset(dataset_name=dataset_name) - + def test_examples_length_validation(langchain_client: Client) -> None: """Test that mismatched lengths raise ValueError for create and update examples.""" diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index ab65d7b24..1127c05c0 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -653,7 +653,6 @@ async def bad_eval_list(run, example): async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators # args need to be positional async def eval1(*, inputs, outputs): @@ -911,7 +910,7 @@ def test_normalize_evaluator_func_invalid(func, is_async): ) else: evaluate(target, data=ds_examples, evaluators=[func], client=client) - + def summary_eval_runs_examples(runs_, examples_): return {"score": len(runs_[0].dotted_order)} From cf53bbe9cf401ab69b68557a8674ae5bd8e1de44 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Fri, 6 Dec 2024 16:25:46 -0800 Subject: [PATCH 4/9] fmt --- python/langsmith/evaluation/_arunner.py | 13 +++++++++++-- python/langsmith/evaluation/_runner.py | 14 ++++++++++++-- python/langsmith/evaluation/evaluator.py | 2 +- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 132ab4fcb..a84556dc6 100644 --- 
a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -259,6 +259,7 @@ async def aevaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + .. versionchanged:: 0.2.0 'max_concurrency' default updated from None (no limit on concurrency) @@ -1056,13 +1057,21 @@ def _get_run(r: run_trees.RunTree) -> None: def _evaluators_include_attachments( - evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]], + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], ) -> bool: + if evaluators is None: + return False return any( any( p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) and p.name == "attachments" - for p in inspect.signature(e.__call__).parameters.values() + for p in ( + inspect.signature( + e.__call__ if hasattr(e, "__call__") else e + ).parameters.values() + if callable(e) or hasattr(e, "__call__") + else [] + ) ) for e in evaluators ) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ff0d3def3..8d42d1847 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1914,12 +1914,22 @@ def _ensure_traceable( return fn -def _evaluators_include_attachments(evaluators: Sequence[EVALUATOR_T]) -> bool: +def _evaluators_include_attachments( + evaluators: Optional[Sequence[EVALUATOR_T]], +) -> bool: + if evaluators is None: + return False return any( any( p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) and p.name == "attachments" - for p in inspect.signature(e.__call__).parameters.values() + for p in ( + inspect.signature( + e.__call__ if hasattr(e, "__call__") else e + ).parameters.values() + if callable(e) or hasattr(e, "__call__") + else [] + ) ) for e in evaluators ) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 0ebb7858e..13aa8c41e 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": example.attachment_urls or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) From 0cb211869f3e4c6150aaeab41209d523d1d33032 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 08:51:23 -0800 Subject: [PATCH 5/9] refactor --- python/langsmith/evaluation/evaluator.py | 4 +-- python/tests/integration_tests/test_client.py | 32 ++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 13aa8c41e..0300de424 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {} if example else {}, + "attachments": example.attachments_info or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": 
example.attachments_info or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 7c756ac5d..e5bc09621 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1262,12 +1262,12 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments + assert "presigned_url" in attachments["image"] image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} @@ -1276,7 +1276,8 @@ def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1326,7 +1327,8 @@ def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1404,8 +1406,7 @@ async def test_aevaluate_with_attachments(langchain_client: Client) -> None: data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, attachments={ @@ -1413,14 +1414,15 @@ async def test_aevaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) async def target( inputs: Dict[str, Any], attachments: Dict[str, Any] ) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} @@ -1428,7 +1430,8 @@ async def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1458,8 +1461,7 @@ async def test_aevaluate_with_attachments_not_in_target( data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, attachments={ @@ -1467,7 +1469,7 @@ async def test_aevaluate_with_attachments_not_in_target( }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, 
uploads=[example]) async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data @@ -1477,7 +1479,8 @@ async def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1513,12 +1516,11 @@ async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: ) # Verify we can create example the new way without attachments - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is 3+1?"}, outputs={"answer": "4"}, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) async def target( inputs: Dict[str, Any], attachments: Dict[str, Any] From 799f69c801db24efb617a31830b8bb22ac969f8a Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 11:12:25 -0800 Subject: [PATCH 6/9] fmt --- python/langsmith/evaluation/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 0300de424..13aa8c41e 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachments_info or {} if example else {}, + "attachments": example.attachment_urls or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachments_info or {}, + "attachments": example.attachment_urls or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) From cedd8afc455e1a73fb8805f5d09e1945ebfc338a Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 14:22:30 -0800 Subject: [PATCH 7/9] attachment_urls -> attachments --- python/langsmith/evaluation/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 13aa8c41e..a1505699a 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {} if example else {}, + "attachments": example.attachments or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": example.attachments or {}, "reference_outputs": example.outputs or {} if example else {}, } args = 
(arg_map[arg] for arg in positional_args) From b99cdc4a60fad88b5b3c2f8a5e3c3ede1a5629a3 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 16:11:14 -0800 Subject: [PATCH 8/9] fmt --- python/langsmith/evaluation/_arunner.py | 66 +------------------ python/langsmith/evaluation/_runner.py | 5 +- .../unit_tests/evaluation/test_runner.py | 1 + 3 files changed, 4 insertions(+), 68 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 463ce86ff..7cee6bcf5 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -37,6 +37,7 @@ DATA_T, EVALUATOR_T, ExperimentResultRow, + _evaluators_include_attachments, _ExperimentManagerMixin, _extract_feedback_keys, _ForwardResults, @@ -1060,71 +1061,6 @@ def _get_run(r: run_trees.RunTree) -> None: ) -def _evaluators_include_attachments( - evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], -) -> bool: - if evaluators is None: - return False - return any( - any( - p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) - and p.name == "attachments" - for p in ( - inspect.signature( - e.__call__ if hasattr(e, "__call__") else e - ).parameters.values() - if callable(e) or hasattr(e, "__call__") - else [] - ) - ) - for e in evaluators - ) - - -def _include_attachments( - target: Union[ATARGET_T, Iterable[schemas.Run], AsyncIterable[dict], Runnable], -) -> bool: - """Whether the target function accepts attachments.""" - if _is_langchain_runnable(target) or not callable(target): - return False - # Check function signature - sig = inspect.signature(target) - params = list(sig.parameters.values()) - positional_params = [ - p - for p in params - if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) - and p.default is p.empty - ] - - if len(positional_params) == 0: - raise ValueError( - "Target function must accept at least one positional argument (inputs)" - ) - elif len(positional_params) > 2: - raise ValueError( - "Target function must accept at most two positional " - "arguments (inputs, attachments)" - ) - elif len(positional_params) == 2: - mismatches = [] - for i, (p, expected) in enumerate( - zip(positional_params, ("inputs", "attachments")) - ): - if p.name != expected: - mismatches.append((i, p.name)) - - if mismatches: - raise ValueError( - "When target function has two positional arguments, they must be named " - "'inputs' and 'attachments', respectively. 
Received: " - + ",".join(f"'{p}' at index {i}" for i, p in mismatches) - ) - - return len(positional_params) == 2 - - - def _ensure_async_traceable( target: ATARGET_T, ) -> rh.SupportsLangsmithExtra[[dict], Awaitable]: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index f01bb1fcb..ddbd9bf18 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1914,9 +1914,8 @@ def _ensure_traceable( return fn - def _evaluators_include_attachments( - evaluators: Optional[Sequence[EVALUATOR_T]], + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], ) -> bool: if evaluators is None: return False @@ -1937,7 +1936,7 @@ def _evaluators_include_attachments( def _include_attachments( - target: Union[TARGET_T, Iterable[schemas.Run], Runnable], + target: Any, ) -> bool: """Whether the target function accepts attachments.""" if _is_langchain_runnable(target) or not callable(target): diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index e45c2cb02..1863f35b0 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -421,6 +421,7 @@ def eval2(x, y, inputs): client=client, ) + def test_evaluate_raises_for_async(): async def my_func(inputs: dict): pass From 24d0159e6b534d8a33a07d7ef4b3e487129dfd9b Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 16:40:25 -0800 Subject: [PATCH 9/9] fmt --- python/tests/unit_tests/evaluation/test_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 1863f35b0..e33d07fd5 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -910,9 +910,12 @@ def test_normalize_evaluator_func_valid(func, is_async): ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] tenant_id = str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client(api_url="http://localhost:1984", api_key="123", session=session) client._tenant_id = tenant_id # type: ignore @@ -935,9 +938,12 @@ def test_normalize_evaluator_func_invalid(func, is_async): ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] tenant_id = str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client(api_url="http://localhost:1984", api_key="123", session=session) client._tenant_id = tenant_id # type: ignore
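
For reference, the end-user shape these patches enable — target functions and evaluators that take a positional `attachments` argument, where each named attachment carries a `presigned_url` and a binary `reader` — looks roughly like the sketch below. This sketch sits outside the patch series and is not part of it; the dataset name is assumed, and running it requires a configured LangSmith client and an existing dataset whose examples were uploaded with an "image" attachment, as in the integration tests above.

from typing import Any, Dict

from langsmith import Client


def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
    # Each attachment is keyed by name; the reader yields the raw bytes that
    # were uploaded with the example, and the presigned URL sits alongside it.
    image_bytes = attachments["image"]["reader"].read()
    return {"answer": "test image", "image_size": len(image_bytes)}


def evaluator(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    # Evaluators may now also declare `attachments` as a positional argument,
    # in addition to inputs / outputs / reference_outputs.
    assert "presigned_url" in attachments["image"]
    return {"score": float(outputs.get("answer") == reference_outputs.get("answer"))}


client = Client()
results = client.evaluate(target, data="my-dataset", evaluators=[evaluator])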