add attachments to evaluate (#1237)
isahers1 authored Dec 10, 2024
1 parent 8bb0826 commit c841ec6
Showing 5 changed files with 408 additions and 17 deletions.
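
For orientation, here is a minimal usage sketch (not part of this diff) of what the change enables: evaluators passed to evaluate/aevaluate can now declare an attachments parameter and receive the example's attachments alongside outputs and reference_outputs, mirroring the integration tests added below. The dataset name and attachment key are made up; the attachment value shape ({"presigned_url": ..., "reader": ...}) follows the assertions in those tests.

from langsmith.client import Client

client = Client()

def target(inputs: dict, attachments: dict) -> dict:
    # Targets could already request attachments; each entry exposes a reader.
    image_bytes = attachments["image"]["reader"].read()
    return {"answer": "test image" if image_bytes else "no image"}

def evaluator(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    # New in this commit: evaluators may also request the example's attachments.
    assert "image" in attachments
    return {"score": float(reference_outputs.get("answer") == outputs.get("answer"))}

results = client.evaluate(
    target,
    data="my-attachments-dataset",  # hypothetical dataset name
    evaluators=[evaluator],
)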
5 changes: 4 additions & 1 deletion python/langsmith/evaluation/_arunner.py
@@ -37,6 +37,7 @@
DATA_T,
EVALUATOR_T,
ExperimentResultRow,
_evaluators_include_attachments,
_ExperimentManagerMixin,
_extract_feedback_keys,
_ForwardResults,
@@ -259,6 +260,7 @@ async def aevaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
.. versionchanged:: 0.2.0
'max_concurrency' default updated from None (no limit on concurrency)
@@ -476,7 +478,8 @@ async def _aevaluate(
description=description,
num_repetitions=num_repetitions,
runs=runs,
include_attachments=_include_attachments(target),
include_attachments=_include_attachments(target)
or _evaluators_include_attachments(evaluators),
upload_results=upload_results,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
28 changes: 26 additions & 2 deletions python/langsmith/evaluation/_runner.py
@@ -1064,7 +1064,8 @@ def _evaluate(
# If provided, we don't need to create a new experiment.
runs=runs,
# Create or resolve the experiment.
include_attachments=_include_attachments(target),
include_attachments=_include_attachments(target)
or _evaluators_include_attachments(evaluators),
upload_results=upload_results,
).start()
cache_dir = ls_utils.get_cache_dir(None)
@@ -1913,7 +1914,30 @@ def _ensure_traceable(
return fn


def _include_attachments(target: Any) -> bool:
def _evaluators_include_attachments(
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]],
) -> bool:
if evaluators is None:
return False
return any(
any(
p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
and p.name == "attachments"
for p in (
inspect.signature(
e.__call__ if hasattr(e, "__call__") else e
).parameters.values()
if callable(e) or hasattr(e, "__call__")
else []
)
)
for e in evaluators
)


def _include_attachments(
target: Any,
) -> bool:
"""Whether the target function accepts attachments."""
if _is_langchain_runnable(target) or not callable(target):
return False
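The new _evaluators_include_attachments helper above decides whether attachments must be fetched by checking each evaluator's signature for a positional parameter named "attachments"; _evaluate and _aevaluate then OR that result with the existing target check. A standalone sketch of the same signature-inspection idea (an illustration, not the SDK helper itself; evaluator names are made up):

import inspect
from typing import Callable

def wants_attachments(fn: Callable) -> bool:
    # True if fn declares a positional parameter literally named "attachments".
    return any(
        p.name == "attachments"
        and p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
        for p in inspect.signature(fn).parameters.values()
    )

def with_files(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    return {"score": 1.0}

def without_files(outputs: dict, reference_outputs: dict) -> dict:
    return {"score": 1.0}

assert wants_attachments(with_files) is True
assert wants_attachments(without_files) is False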
11 changes: 10 additions & 1 deletion python/langsmith/evaluation/evaluator.py
@@ -624,7 +624,14 @@ def _normalize_evaluator_func(
Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
supported_args = ("run", "example", "inputs", "outputs", "reference_outputs")
supported_args = (
"run",
"example",
"inputs",
"outputs",
"reference_outputs",
"attachments",
)
sig = inspect.signature(func)
positional_args = [
pname
@@ -659,6 +666,7 @@ async def awrapper(
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachments or {} if example else {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
@@ -679,6 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachments or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
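The evaluator.py hunks above add "attachments" to supported_args and to the arg_map built inside the sync and async wrappers, so a user evaluator that declares attachments is fed example.attachments (or {}) at call time. A simplified sketch of that mapping idea (illustration only; the real _normalize_evaluator_func also validates argument names and handles async evaluators):

import inspect
from typing import Callable

def make_run_example_wrapper(func: Callable) -> Callable:
    # Resolve each parameter name the evaluator declares from values
    # derived from the (run, example) pair, then call it positionally.
    positional = list(inspect.signature(func).parameters)

    def wrapper(run, example):
        arg_map = {
            "run": run,
            "example": example,
            "inputs": example.inputs if example else {},
            "outputs": run.outputs or {},
            "attachments": example.attachments or {} if example else {},
            "reference_outputs": example.outputs or {} if example else {},
        }
        return func(*(arg_map[name] for name in positional))

    return wrapper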
227 changes: 214 additions & 13 deletions python/tests/integration_tests/test_client.py
@@ -20,7 +20,7 @@
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from langsmith.client import ID_TYPE, Client
from langsmith.evaluation import evaluate
from langsmith.evaluation import aevaluate, evaluate
from langsmith.schemas import (
AttachmentsOperations,
DataType,
@@ -1215,9 +1215,6 @@ def create_encoder(*args, **kwargs):
assert not caplog.records


@pytest.mark.skip(
reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first"
)
def test_list_examples_attachments_keys(langchain_client: Client) -> None:
"""Test list_examples returns same keys with and without attachments."""
dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4]
@@ -1256,6 +1253,7 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None:
def test_evaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]

# 1. Create dataset
dataset = langchain_client.create_dataset(
dataset_name,
@@ -1274,37 +1272,89 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None:

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

# 3. Define target function that uses attachments
def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

# 4. Define simple evaluator
def evaluator(run: Run, example: Example) -> Dict[str, Any]:
def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

# 5. Run evaluation
results = evaluate(
results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
num_repetitions=2,
)

assert len(results) == 2
for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
dataset_id=dataset.id,
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(uploads=[example])

def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
        # This target does not accept attachments.
return {"answer": "test image"}

def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
client=langchain_client,
num_repetitions=2,
)

# 6. Verify results
assert len(results) == 2
for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

# Cleanup
langchain_client.delete_dataset(dataset_name=dataset_name)


@@ -1355,6 +1405,157 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]:
langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

async def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = await langchain_client.aevaluate(
target, data=dataset_name, evaluators=[evaluator], num_repetitions=2
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_attachments_not_in_target(
langchain_client: Client,
) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
        # This target does not accept attachments.
return {"answer": "test image"}

async def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = await langchain_client.aevaluate(
target, data=dataset_name, evaluators=[evaluator], num_repetitions=2
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None:
"""Test evaluating examples without attachments using a target with attachments."""
dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals without attachments",
data_type=DataType.kv,
)

    # Create an example the old way; attachments should default to {}
langchain_client.create_example(
dataset_id=dataset.id,
inputs={"question": "What is 2+2?"},
outputs={"answer": "4"},
)

# Verify we can create example the new way without attachments
example = ExampleUploadWithAttachments(
inputs={"question": "What is 3+1?"},
outputs={"answer": "4"},
)
langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive an empty attachments dict
assert isinstance(attachments, dict)
assert len(attachments) == 0
return {"answer": "4"}

async def evaluator(run: Run, example: Example) -> Dict[str, Any]:
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
)
}

results = await aevaluate(
target, data=dataset_name, evaluators=[evaluator], client=langchain_client
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


def test_examples_length_validation(langchain_client: Client) -> None:
"""Test that mismatched lengths raise ValueError for create and update examples."""
dataset_name = "__test_examples_length_validation" + uuid4().hex[:4]
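For completeness, the attachment-bearing examples these tests evaluate are uploaded with ExampleUploadWithAttachments and Client.upload_examples_multipart, as in the hunks above. A condensed sketch of that setup (assuming ExampleUploadWithAttachments is importable from langsmith.schemas, which the truncated import block above does not show; the dataset name is made up):

from langsmith.client import Client
from langsmith.schemas import ExampleUploadWithAttachments

client = Client()
dataset = client.create_dataset("my-attachments-dataset")

example = ExampleUploadWithAttachments(
    inputs={"question": "What is shown in the image?"},
    outputs={"answer": "test image"},
    attachments={
        # (mime type, raw bytes) pairs, as in the tests above
        "image": ("image/png", b"fake image data for testing"),
    },
)
client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])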
(1 additional changed file not expanded in this view)
