From 9bd5969dd83f0568d71a6ff90d110e25d6fb723e Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Wed, 20 Nov 2024 10:53:20 -0800 Subject: [PATCH 1/9] wip --- python/langsmith/client.py | 14 +- python/langsmith/evaluation/evaluator.py | 11 +- python/tests/integration_tests/test_client.py | 132 ++++++++++-- .../unit_tests/evaluation/test_runner.py | 191 ++++++++++++++---- 4 files changed, 286 insertions(+), 62 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index f1213d939..1976259c1 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -122,6 +122,16 @@ class ZoneInfo: # type: ignore[no-redef] URLLIB3_SUPPORTS_BLOCKSIZE = "key_blocksize" in signature(PoolKey).parameters +class AutoSeekBytesIO(io.BytesIO): + """BytesIO class that resets on read.""" + + def read(self, *args, **kwargs): + """Reset on read.""" + data = super().read(*args, **kwargs) + self.seek(0) + return data + + def _parse_token_or_url( url_or_token: Union[str, uuid.UUID], api_url: str, @@ -3672,7 +3682,7 @@ def read_example( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = io.BytesIO(response.content) + reader = AutoSeekBytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, @@ -3759,7 +3769,7 @@ def list_examples( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = io.BytesIO(response.content) + reader = AutoSeekBytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index feb0e95e4..e2e898f2c 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -657,7 +657,14 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], ]: - supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + supported_args = ( + "run", + "example", + "inputs", + "outputs", + "reference_outputs", + "attachments", + ) sig = inspect.signature(func) positional_args = [ pname @@ -691,6 +698,7 @@ async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "inputs": example.inputs, "outputs": run.outputs or {}, "reference_outputs": example.outputs or {}, + "attachments": example.attachment_urls or {}, } args = (arg_map[arg] for arg in positional_args) return await func(*args) @@ -711,6 +719,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "inputs": example.inputs, "outputs": run.outputs or {}, "reference_outputs": example.outputs or {}, + "attachments": example.attachment_urls or {}, } args = (arg_map[arg] for arg in positional_args) return func(*args) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index cfd848e01..9b85fa480 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -20,7 +20,7 @@ from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor from langsmith.client import ID_TYPE, Client -from langsmith.evaluation import evaluate +from langsmith.evaluation import aevaluate, evaluate from langsmith.schemas import DataType, Example, ExampleUpsertWithAttachments, Run from langsmith.utils import ( 
LangSmithConnectionError, @@ -1122,9 +1122,6 @@ def create_encoder(*args, **kwargs): assert not caplog.records -@pytest.mark.skip( - reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first" -) def test_list_examples_attachments_keys(langchain_client: Client) -> None: """Test list_examples returns same keys with and without attachments.""" dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4] @@ -1160,20 +1157,18 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: langchain_client.delete_dataset(dataset_id=dataset.id) -@pytest.mark.skip( - reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first" -) def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - # 1. Create dataset + langchain_client = Client( + api_key="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", + ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", data_type=DataType.kv, ) - # 2. Create example with attachments example = ExampleUpsertWithAttachments( dataset_id=dataset.id, inputs={"question": "What is shown in the image?"}, @@ -1185,7 +1180,6 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: langchain_client.upsert_examples_multipart(upserts=[example]) - # 3. Define target function that uses attachments def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments @@ -1193,25 +1187,26 @@ def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} - # 4. Define simple evaluator - def evaluator(run: Run, example: Example) -> Dict[str, Any]: + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" return { "score": float( - run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + reference_outputs.get("answer") == outputs.get("answer") # type: ignore ) } - # 5. Run evaluation results = evaluate( target, data=dataset_name, evaluators=[evaluator], client=langchain_client ) - # 6. 
Verify results assert len(results) == 1 for result in results: assert result["evaluation_results"]["results"][0].score == 1.0 - # Cleanup langchain_client.delete_dataset(dataset_name=dataset_name) @@ -1261,3 +1256,108 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]: assert result["evaluation_results"]["results"][0].score == 1.0 langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + langchain_client = Client( + api_key="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", + ) + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive the attachment data + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await aevaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 1 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: + """Test evaluating examples without attachments using a target with attachments.""" + dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals without attachments", + data_type=DataType.kv, + ) + + # Create example using old way, attachments should be set to {} + langchain_client.create_example( + dataset_id=dataset.id, + inputs={"question": "What is 2+2?"}, + outputs={"answer": "4"}, + ) + + # Verify we can create example the new way without attachments + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is 3+1?"}, + outputs={"answer": "4"}, + ) + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive an empty attachments dict + assert isinstance(attachments, dict) + assert len(attachments) == 0 + return {"answer": "4"} + + async def evaluator(run: Run, example: Example) -> Dict[str, Any]: + return { + "score": float( + run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + ) + } + + results = await aevaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 2 + 
async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 6fb5a1739..413422523 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -208,6 +208,14 @@ def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -227,6 +235,8 @@ def eval_list(run, example): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -313,25 +323,6 @@ def bad_eval_list(run, example): for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators - # args need to be positional - def eval1(*, inputs, outputs): - pass - - # if more than 2 positional args, they must all have default arg names - # (run, example, ...) - def eval2(x, y, inputs): - pass - - evaluators = [eval1, eval2] - - for eval_ in evaluators: - with pytest.raises(ValueError, match="Invalid evaluator function."): - _normalize_evaluator_func(eval_) - - with pytest.raises(ValueError, match="Invalid evaluator function."): - evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client) - def test_evaluate_raises_for_async(): async def my_func(inputs: dict): @@ -439,6 +430,14 @@ async def score_unpacked_inputs_outputs_reference( ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + async def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -458,6 +457,8 @@ async def eval_list(run, example): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -545,30 +546,6 @@ async def bad_eval_list(run, example): async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators - # args need to be positional - async def eval1(*, inputs, outputs): - pass - - # if more than 2 positional args, they must all have default arg names - # (run, example, ...) 
- async def eval2(x, y, inputs): - pass - - evaluators = [eval1, eval2] - - async def atarget(x): - return x - - for eval_ in evaluators: - with pytest.raises(ValueError, match="Invalid evaluator function."): - _normalize_evaluator_func(eval_) - - with pytest.raises(ValueError, match="Invalid evaluator function."): - await aevaluate( - atarget, data=ds_examples, evaluators=[eval_], client=client - ) - @as_runnable def nested_predict(inputs): @@ -647,3 +624,131 @@ def test_include_attachments(target, expected, error_msg): else: result = _include_attachments(target) assert result == expected + + +def valid_single_supported(inputs, *, optional=None): + return {"score": 1} + + +async def valid_single_supported_async(inputs, *, optional=None): + return {"score": 1} + + +def valid_two_arbitrary(foo, bar, *, optional=None): + return {"score": 1} + + +async def valid_two_arbitrary_async(foo, bar, *, optional=None): + return {"score": 1} + + +def valid_multiple_supported(inputs, outputs, reference_outputs, *, optional=None): + return {"score": 1} + + +async def valid_multiple_supported_async( + inputs, outputs, reference_outputs, *, optional=None +): + return {"score": 1} + + +def invalid_single_unsupported(foo, *, optional=None): + return {"score": 1} + + +async def invalid_single_unsupported_async(foo, *, optional=None): + return {"score": 1} + + +def invalid_three_args(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +async def invalid_three_args_async(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +def invalid_no_positional(*, inputs, outputs, optional=None): + return {"score": 1} + + +async def invalid_no_positional_async(*, inputs, outputs, optional=None): + return {"score": 1} + + +# Test cases that should succeed +VALID_EVALUATOR_CASES = [ + (valid_single_supported, False), + (valid_single_supported_async, True), + (valid_two_arbitrary, False), + (valid_two_arbitrary_async, True), + (valid_multiple_supported, False), + (valid_multiple_supported_async, True), +] + +# Test cases that should raise ValueError +INVALID_EVALUATOR_CASES = [ + (invalid_single_unsupported, False), + (invalid_single_unsupported_async, True), + (invalid_three_args, False), + (invalid_three_args_async, True), + (invalid_no_positional, False), + (invalid_no_positional_async, True), +] + + +def target(inputs, attachments): + return {"foo": "bar"} + + +async def atarget(inputs, attachments): + return {"foo": "bar"} + + +@pytest.mark.parametrize("func,is_async", VALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_valid(func, is_async): + """Test _normalize_evaluator_func succeeds.""" + func = _normalize_evaluator_func(func) + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_examples = [_create_example(i) for i in range(10)] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + +@pytest.mark.parametrize("func,is_async", INVALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_invalid(func, is_async): + """Test _normalize_evaluator_func fails correctly.""" + with pytest.raises(ValueError, match="Invalid evaluator 
function"): + _normalize_evaluator_func(func) + + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_examples = [_create_example(i) for i in range(10)] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + with pytest.raises(ValueError, match="Invalid evaluator function"): + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) From a72a268ffd02d9044dd0f332f5b2c51daa126aef Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Wed, 20 Nov 2024 11:05:06 -0800 Subject: [PATCH 2/9] rip keys --- python/tests/integration_tests/test_client.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 9b85fa480..03e816e0a 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1160,9 +1160,6 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - langchain_client = Client( - api_key="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", - ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", @@ -1261,9 +1258,6 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]: async def test_aevaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] - langchain_client = Client( - api_key="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", - ) dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", From 7307836aa7a01ba75dc5480fad213193f1830d57 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Fri, 6 Dec 2024 15:52:59 -0800 Subject: [PATCH 3/9] changes --- python/langsmith/client.py | 14 +-- python/langsmith/evaluation/_arunner.py | 16 ++- python/langsmith/evaluation/_runner.py | 14 ++- python/tests/integration_tests/test_client.py | 115 ++++++++++++++++-- .../unit_tests/evaluation/test_runner.py | 3 +- 5 files changed, 137 insertions(+), 25 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index f052681cd..4e32188e6 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -140,16 +140,6 @@ class ZoneInfo: # type: ignore[no-redef] URLLIB3_SUPPORTS_BLOCKSIZE = "key_blocksize" in signature(PoolKey).parameters -class AutoSeekBytesIO(io.BytesIO): - """BytesIO class that resets on read.""" - - def read(self, *args, **kwargs): - """Reset on read.""" - data = super().read(*args, **kwargs) - self.seek(0) - return data - - def _parse_token_or_url( url_or_token: Union[str, uuid.UUID], api_url: str, @@ -3835,7 +3825,7 @@ def read_example( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = AutoSeekBytesIO(response.content) + reader = io.BytesIO(response.content) 
attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, @@ -3922,7 +3912,7 @@ def list_examples( for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() - reader = AutoSeekBytesIO(response.content) + reader = io.BytesIO(response.content) attachment_urls[key.split(".")[1]] = ( value["presigned_url"], reader, diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 9412bf5f3..132ab4fcb 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -476,7 +476,8 @@ async def _aevaluate( description=description, num_repetitions=num_repetitions, runs=runs, - include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).astart() cache_dir = ls_utils.get_cache_dir(None) @@ -1054,6 +1055,19 @@ def _get_run(r: run_trees.RunTree) -> None: ) +def _evaluators_include_attachments( + evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]], +) -> bool: + return any( + any( + p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.name == "attachments" + for p in inspect.signature(e.__call__).parameters.values() + ) + for e in evaluators + ) + + def _include_attachments( target: Union[ATARGET_T, Iterable[schemas.Run], AsyncIterable[dict], Runnable], ) -> bool: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ebd259e14..ff0d3def3 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1064,7 +1064,8 @@ def _evaluate( # If provided, we don't need to create a new experiment. runs=runs, # Create or resolve the experiment. 
- include_attachments=_include_attachments(target), + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).start() cache_dir = ls_utils.get_cache_dir(None) @@ -1913,6 +1914,17 @@ def _ensure_traceable( return fn +def _evaluators_include_attachments(evaluators: Sequence[EVALUATOR_T]) -> bool: + return any( + any( + p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.name == "attachments" + for p in inspect.signature(e.__call__).parameters.values() + ) + for e in evaluators + ) + + def _include_attachments( target: Union[TARGET_T, Iterable[schemas.Run], Runnable], ) -> bool: diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 79d02e60f..ed136a0bb 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1265,14 +1265,13 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None: def test_evaluate_with_attachments(langchain_client: Client) -> None: """Test evaluating examples with attachments.""" dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] - dataset = langchain_client.create_dataset( dataset_name, description="Test dataset for evals with attachments", data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( + example = ExampleUploadWithAttachments( dataset_id=dataset.id, inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, @@ -1281,7 +1280,7 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(uploads=[example]) def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data @@ -1302,11 +1301,60 @@ def evaluator( ) } - results = evaluate( + results = langchain_client.evaluate( + target, + data=dataset_name, + evaluators=[evaluator], + num_repetitions=2, + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(uploads=[example]) + + def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = langchain_client.evaluate( target, data=dataset_name, evaluators=[evaluator], - client=langchain_client, num_repetitions=2, ) @@ -1406,11 +1454,60 @@ 
async def evaluator( ) } - results = await aevaluate( - target, data=dataset_name, evaluators=[evaluator], client=langchain_client + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments_not_in_target( + langchain_client: Client, +) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upsert_examples_multipart(upserts=[example]) + + async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + image_url, image_data = attachments["image"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 ) - assert len(results) == 1 + assert len(results) == 2 async for result in results: assert result["evaluation_results"]["results"][0].score == 1.0 @@ -1465,7 +1562,7 @@ async def evaluator(run: Run, example: Example) -> Dict[str, Any]: assert result["evaluation_results"]["results"][0].score == 1.0 langchain_client.delete_dataset(dataset_name=dataset_name) - + def test_examples_length_validation(langchain_client: Client) -> None: """Test that mismatched lengths raise ValueError for create and update examples.""" diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index ab65d7b24..1127c05c0 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -653,7 +653,6 @@ async def bad_eval_list(run, example): async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} - # test invalid evaluators # args need to be positional async def eval1(*, inputs, outputs): @@ -911,7 +910,7 @@ def test_normalize_evaluator_func_invalid(func, is_async): ) else: evaluate(target, data=ds_examples, evaluators=[func], client=client) - + def summary_eval_runs_examples(runs_, examples_): return {"score": len(runs_[0].dotted_order)} From cf53bbe9cf401ab69b68557a8674ae5bd8e1de44 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Fri, 6 Dec 2024 16:25:46 -0800 Subject: [PATCH 4/9] fmt --- python/langsmith/evaluation/_arunner.py | 13 +++++++++++-- python/langsmith/evaluation/_runner.py | 14 ++++++++++++-- python/langsmith/evaluation/evaluator.py | 2 +- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 132ab4fcb..a84556dc6 100644 --- 
a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -259,6 +259,7 @@ async def aevaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + .. versionchanged:: 0.2.0 'max_concurrency' default updated from None (no limit on concurrency) @@ -1056,13 +1057,21 @@ def _get_run(r: run_trees.RunTree) -> None: def _evaluators_include_attachments( - evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]], + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], ) -> bool: + if evaluators is None: + return False return any( any( p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) and p.name == "attachments" - for p in inspect.signature(e.__call__).parameters.values() + for p in ( + inspect.signature( + e.__call__ if hasattr(e, "__call__") else e + ).parameters.values() + if callable(e) or hasattr(e, "__call__") + else [] + ) ) for e in evaluators ) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ff0d3def3..8d42d1847 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1914,12 +1914,22 @@ def _ensure_traceable( return fn -def _evaluators_include_attachments(evaluators: Sequence[EVALUATOR_T]) -> bool: +def _evaluators_include_attachments( + evaluators: Optional[Sequence[EVALUATOR_T]], +) -> bool: + if evaluators is None: + return False return any( any( p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) and p.name == "attachments" - for p in inspect.signature(e.__call__).parameters.values() + for p in ( + inspect.signature( + e.__call__ if hasattr(e, "__call__") else e + ).parameters.values() + if callable(e) or hasattr(e, "__call__") + else [] + ) ) for e in evaluators ) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 0ebb7858e..13aa8c41e 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": example.attachment_urls or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) From 0cb211869f3e4c6150aaeab41209d523d1d33032 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 08:51:23 -0800 Subject: [PATCH 5/9] refactor --- python/langsmith/evaluation/evaluator.py | 4 +-- python/tests/integration_tests/test_client.py | 32 ++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 13aa8c41e..0300de424 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {} if example else {}, + "attachments": example.attachments_info or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": 
example.attachments_info or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 7c756ac5d..e5bc09621 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -1262,12 +1262,12 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments + assert "presigned_url" in attachments["image"] image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} @@ -1276,7 +1276,8 @@ def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1326,7 +1327,8 @@ def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1404,8 +1406,7 @@ async def test_aevaluate_with_attachments(langchain_client: Client) -> None: data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, attachments={ @@ -1413,14 +1414,15 @@ async def test_aevaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) async def target( inputs: Dict[str, Any], attachments: Dict[str, Any] ) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} @@ -1428,7 +1430,8 @@ async def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1458,8 +1461,7 @@ async def test_aevaluate_with_attachments_not_in_target( data_type=DataType.kv, ) - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, attachments={ @@ -1467,7 +1469,7 @@ async def test_aevaluate_with_attachments_not_in_target( }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, 
uploads=[example]) async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data @@ -1477,7 +1479,8 @@ async def evaluator( outputs: dict, reference_outputs: dict, attachments: dict ) -> Dict[str, Any]: assert "image" in attachments - image_url, image_data = attachments["image"] + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return { "score": float( @@ -1513,12 +1516,11 @@ async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: ) # Verify we can create example the new way without attachments - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is 3+1?"}, outputs={"answer": "4"}, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) async def target( inputs: Dict[str, Any], attachments: Dict[str, Any] From 799f69c801db24efb617a31830b8bb22ac969f8a Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 11:12:25 -0800 Subject: [PATCH 6/9] fmt --- python/langsmith/evaluation/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 0300de424..13aa8c41e 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachments_info or {} if example else {}, + "attachments": example.attachment_urls or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachments_info or {}, + "attachments": example.attachment_urls or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) From cedd8afc455e1a73fb8805f5d09e1945ebfc338a Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 14:22:30 -0800 Subject: [PATCH 7/9] attachment_urls -> attachments --- python/langsmith/evaluation/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 13aa8c41e..a1505699a 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -666,7 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {} if example else {}, + "attachments": example.attachments or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -687,7 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, - "attachments": example.attachment_urls or {}, + "attachments": example.attachments or {}, "reference_outputs": example.outputs or {} if example else {}, } args = 
(arg_map[arg] for arg in positional_args) From b99cdc4a60fad88b5b3c2f8a5e3c3ede1a5629a3 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 16:11:14 -0800 Subject: [PATCH 8/9] fmt --- python/langsmith/evaluation/_arunner.py | 66 +------------------ python/langsmith/evaluation/_runner.py | 5 +- .../unit_tests/evaluation/test_runner.py | 1 + 3 files changed, 4 insertions(+), 68 deletions(-) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 463ce86ff..7cee6bcf5 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -37,6 +37,7 @@ DATA_T, EVALUATOR_T, ExperimentResultRow, + _evaluators_include_attachments, _ExperimentManagerMixin, _extract_feedback_keys, _ForwardResults, @@ -1060,71 +1061,6 @@ def _get_run(r: run_trees.RunTree) -> None: ) -def _evaluators_include_attachments( - evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], -) -> bool: - if evaluators is None: - return False - return any( - any( - p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) - and p.name == "attachments" - for p in ( - inspect.signature( - e.__call__ if hasattr(e, "__call__") else e - ).parameters.values() - if callable(e) or hasattr(e, "__call__") - else [] - ) - ) - for e in evaluators - ) - - -def _include_attachments( - target: Union[ATARGET_T, Iterable[schemas.Run], AsyncIterable[dict], Runnable], -) -> bool: - """Whether the target function accepts attachments.""" - if _is_langchain_runnable(target) or not callable(target): - return False - # Check function signature - sig = inspect.signature(target) - params = list(sig.parameters.values()) - positional_params = [ - p - for p in params - if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) - and p.default is p.empty - ] - - if len(positional_params) == 0: - raise ValueError( - "Target function must accept at least one positional argument (inputs)" - ) - elif len(positional_params) > 2: - raise ValueError( - "Target function must accept at most two positional " - "arguments (inputs, attachments)" - ) - elif len(positional_params) == 2: - mismatches = [] - for i, (p, expected) in enumerate( - zip(positional_params, ("inputs", "attachments")) - ): - if p.name != expected: - mismatches.append((i, p.name)) - - if mismatches: - raise ValueError( - "When target function has two positional arguments, they must be named " - "'inputs' and 'attachments', respectively. 
Received: " - + ",".join(f"'{p}' at index {i}" for i, p in mismatches) - ) - - return len(positional_params) == 2 - - - def _ensure_async_traceable( target: ATARGET_T, ) -> rh.SupportsLangsmithExtra[[dict], Awaitable]: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index f01bb1fcb..ddbd9bf18 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1914,9 +1914,8 @@ def _ensure_traceable( return fn - def _evaluators_include_attachments( - evaluators: Optional[Sequence[EVALUATOR_T]], + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], ) -> bool: if evaluators is None: return False @@ -1937,7 +1936,7 @@ def _evaluators_include_attachments( def _include_attachments( - target: Union[TARGET_T, Iterable[schemas.Run], Runnable], + target: Any, ) -> bool: """Whether the target function accepts attachments.""" if _is_langchain_runnable(target) or not callable(target): diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index e45c2cb02..1863f35b0 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -421,6 +421,7 @@ def eval2(x, y, inputs): client=client, ) + def test_evaluate_raises_for_async(): async def my_func(inputs: dict): pass From 24d0159e6b534d8a33a07d7ef4b3e487129dfd9b Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 16:40:25 -0800 Subject: [PATCH 9/9] fmt --- python/tests/unit_tests/evaluation/test_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 1863f35b0..e33d07fd5 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -910,9 +910,12 @@ def test_normalize_evaluator_func_valid(func, is_async): ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] tenant_id = str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client(api_url="http://localhost:1984", api_key="123", session=session) client._tenant_id = tenant_id # type: ignore @@ -935,9 +938,12 @@ def test_normalize_evaluator_func_invalid(func, is_async): ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] tenant_id = str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client(api_url="http://localhost:1984", api_key="123", session=session) client._tenant_id = tenant_id # type: ignore
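
For reference, the end-user shape these patches enable — target functions and evaluators that take a positional `attachments` argument, where each named attachment carries a `presigned_url` and a binary `reader` — looks roughly like the sketch below. This sketch sits outside the patch series and is not part of it; the dataset name is assumed, and running it requires a configured LangSmith client and an existing dataset whose examples were uploaded with an "image" attachment, as in the integration tests above.

from typing import Any, Dict

from langsmith import Client


def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
    # Each attachment is keyed by name; the reader yields the raw bytes that
    # were uploaded with the example, and the presigned URL sits alongside it.
    image_bytes = attachments["image"]["reader"].read()
    return {"answer": "test image", "image_size": len(image_bytes)}


def evaluator(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    # Evaluators may now also declare `attachments` as a positional argument,
    # in addition to inputs / outputs / reference_outputs.
    assert "presigned_url" in attachments["image"]
    return {"score": float(outputs.get("answer") == reference_outputs.get("answer"))}


client = Client()
results = client.evaluate(target, data="my-dataset", evaluators=[evaluator])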