add attachments to evaluate #1237

Merged: 21 commits, Dec 10, 2024
Commits (21):
9bd5969  wip  (isahers1, Nov 20, 2024)
a72a268  rip keys  (isahers1, Nov 20, 2024)
16e5e69  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 2, 2024)
0b6e2c4  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 6, 2024)
7307836  changes  (isahers1, Dec 6, 2024)
cf53bbe  fmt  (isahers1, Dec 7, 2024)
2a87196  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
0cb2118  refactor  (isahers1, Dec 9, 2024)
3c92c38  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
a342f86  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
9289225  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
8986216  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
799f69c  fmt  (isahers1, Dec 9, 2024)
ac16178  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
bc36039  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
cedd8af  attachment_urls -> attachments  (isahers1, Dec 9, 2024)
8cc8ce3  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
14130fa  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 9, 2024)
b99cdc4  fmt  (isahers1, Dec 10, 2024)
4d27f41  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator  (isahers1, Dec 10, 2024)
24d0159  fmt  (isahers1, Dec 10, 2024)
5 changes: 4 additions & 1 deletion python/langsmith/evaluation/_arunner.py
@@ -1,4 +1,4 @@
"""V2 Evaluation Interface."""

Check notice on line 1 in python/langsmith/evaluation/_arunner.py (GitHub Actions / benchmark)

Benchmark results (mean +- std dev): create_5_000_run_trees 676 ms +- 78 ms; create_10_000_run_trees 1.40 sec +- 0.16 sec; create_20_000_run_trees 1.36 sec +- 0.15 sec; dumps_class_nested_py_branch_and_leaf_200x400 697 us +- 8 us; dumps_class_nested_py_leaf_50x100 25.1 ms +- 0.5 ms; dumps_class_nested_py_leaf_100x200 104 ms +- 2 ms; dumps_dataclass_nested_50x100 25.4 ms +- 0.4 ms; dumps_pydantic_nested_50x100 72.3 ms +- 16.6 ms; dumps_pydanticv1_nested_50x100 199 ms +- 2 ms. pyperf flagged several of these runs as potentially unstable (std dev 11-23% of the mean).

Comparison against main:

  dumps_pydanticv1_nested_50x100                  216 ms    -> 199 ms    (1.09x faster)
  create_5_000_run_trees                          703 ms    -> 676 ms    (1.04x faster)
  dumps_class_nested_py_branch_and_leaf_200x400   701 us    -> 697 us    (1.00x faster)
  create_20_000_run_trees                         1.37 sec  -> 1.36 sec  (1.00x faster)
  dumps_dataclass_nested_50x100                   25.2 ms   -> 25.4 ms   (1.01x slower)
  dumps_class_nested_py_leaf_50x100               24.8 ms   -> 25.1 ms   (1.01x slower)
  dumps_class_nested_py_leaf_100x200               103 ms   -> 104 ms    (1.02x slower)
  create_10_000_run_trees                         1.37 sec  -> 1.40 sec  (1.02x slower)
  dumps_pydantic_nested_50x100                    65.1 ms   -> 72.3 ms   (1.11x slower)
  Geometric mean                                   (ref)       1.00x slower

from __future__ import annotations

@@ -37,6 +37,7 @@
DATA_T,
EVALUATOR_T,
ExperimentResultRow,
_evaluators_include_attachments,
_ExperimentManagerMixin,
_extract_feedback_keys,
_ForwardResults,
@@ -259,6 +260,7 @@
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...


.. versionchanged:: 0.2.0

'max_concurrency' default updated from None (no limit on concurrency)
@@ -476,7 +478,8 @@
description=description,
num_repetitions=num_repetitions,
runs=runs,
include_attachments=_include_attachments(target),
include_attachments=_include_attachments(target)
or _evaluators_include_attachments(evaluators),
upload_results=upload_results,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
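Taken together with the `_runner.py` change below, the effect is that an experiment is started with `include_attachments=True` whenever either the target or any evaluator declares an `attachments` parameter. A minimal usage sketch of that behavior (the dataset name and evaluator logic are illustrative, not part of this diff; the attachments dict shape with `presigned_url` and `reader` follows the integration tests further down):

```python
from langsmith.evaluation import aevaluate


async def target(inputs: dict) -> dict:
    # The target itself does not ask for attachments...
    return {"answer": "test image"}


async def image_evaluator(
    outputs: dict, reference_outputs: dict, attachments: dict
) -> dict:
    # ...but because this evaluator declares `attachments`, attachment data is
    # fetched for each example and passed in as {"presigned_url": ..., "reader": ...}.
    image_bytes = attachments["image"]["reader"].read()
    return {"score": float(outputs.get("answer") == reference_outputs.get("answer"))}


# results = await aevaluate(target, data="my-dataset", evaluators=[image_evaluator])
```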
28 changes: 26 additions & 2 deletions python/langsmith/evaluation/_runner.py
@@ -1064,7 +1064,8 @@ def _evaluate(
# If provided, we don't need to create a new experiment.
runs=runs,
# Create or resolve the experiment.
include_attachments=_include_attachments(target),
include_attachments=_include_attachments(target)
or _evaluators_include_attachments(evaluators),
upload_results=upload_results,
).start()
cache_dir = ls_utils.get_cache_dir(None)
@@ -1913,7 +1914,30 @@ def _ensure_traceable(
return fn


def _include_attachments(target: Any) -> bool:
def _evaluators_include_attachments(
    evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]],
) -> bool:
    if evaluators is None:
        return False
    return any(
        any(
            p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
            and p.name == "attachments"
            for p in (
                inspect.signature(
                    e.__call__ if hasattr(e, "__call__") else e
                ).parameters.values()
                if callable(e) or hasattr(e, "__call__")
                else []
            )
        )
        for e in evaluators
    )

Reviewer comment (Contributor) on the `if callable(e) or hasattr(e, "__call__")` line:
ooc what kinds of objects have __call__ but aren't callable?


def _include_attachments(
target: Any,
) -> bool:
"""Whether the target function accepts attachments."""
if _is_langchain_runnable(target) or not callable(target):
return False
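Before moving on to `evaluator.py`: as a quick illustration of how the `attachments` signature check in `_evaluators_include_attachments` behaves, here is a self-contained sketch that reduces it to a single callable (the helper name `has_positional_attachments` and the toy evaluators are made up for this example):

```python
import inspect


def has_positional_attachments(fn) -> bool:
    # Same check as _evaluators_include_attachments, for one callable: look for
    # a positional parameter literally named "attachments".
    return any(
        p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
        and p.name == "attachments"
        for p in inspect.signature(fn).parameters.values()
    )


def plain_evaluator(outputs: dict, reference_outputs: dict) -> dict:
    return {"score": 1.0}


def attachment_evaluator(
    outputs: dict, reference_outputs: dict, attachments: dict
) -> dict:
    return {"score": 1.0}


assert has_positional_attachments(plain_evaluator) is False
assert has_positional_attachments(attachment_evaluator) is True
```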
11 changes: 10 additions & 1 deletion python/langsmith/evaluation/evaluator.py
@@ -624,7 +624,14 @@ def _normalize_evaluator_func(
Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
supported_args = ("run", "example", "inputs", "outputs", "reference_outputs")
supported_args = (
"run",
"example",
"inputs",
"outputs",
"reference_outputs",
"attachments",
)
sig = inspect.signature(func)
positional_args = [
pname
@@ -659,6 +666,7 @@ async def awrapper(
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachments or {} if example else {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
@@ -679,6 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachments or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
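The `arg_map` pattern above can be read as: build a map of every supported argument, then hand the evaluator only the arguments its signature declares, which is how `attachments` is added without breaking existing `(run, example)` evaluators. A reduced, standalone sketch of that dispatch (the `call_with_supported_args` helper is illustrative, not part of the SDK):

```python
import inspect
from typing import Any, Callable


def call_with_supported_args(func: Callable, run: Any, example: Any) -> Any:
    # Mirror of the wrapper logic in _normalize_evaluator_func: map all supported
    # argument names to values, then pass only the ones `func` actually declares.
    arg_map = {
        "run": run,
        "example": example,
        "inputs": example.inputs if example else {},
        "outputs": run.outputs or {},
        "attachments": example.attachments or {} if example else {},
        "reference_outputs": example.outputs or {} if example else {},
    }
    wanted = [name for name in inspect.signature(func).parameters if name in arg_map]
    return func(*(arg_map[name] for name in wanted))


# e.g. call_with_supported_args(evaluator, run, example) works whether the
# evaluator takes (run, example) or (outputs, reference_outputs, attachments).
```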
227 changes: 214 additions & 13 deletions python/tests/integration_tests/test_client.py
@@ -20,7 +20,7 @@
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from langsmith.client import ID_TYPE, Client
from langsmith.evaluation import evaluate
from langsmith.evaluation import aevaluate, evaluate
from langsmith.schemas import (
DataType,
Example,
@@ -1213,9 +1213,6 @@ def create_encoder(*args, **kwargs):
assert not caplog.records


@pytest.mark.skip(
reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first"
)
def test_list_examples_attachments_keys(langchain_client: Client) -> None:
"""Test list_examples returns same keys with and without attachments."""
dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4]
@@ -1254,6 +1251,7 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None:
def test_evaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]

# 1. Create dataset
dataset = langchain_client.create_dataset(
dataset_name,
@@ -1272,37 +1270,89 @@

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

# 3. Define target function that uses attachments
def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

# 4. Define simple evaluator
def evaluator(run: Run, example: Example) -> Dict[str, Any]:
def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

# 5. Run evaluation
results = evaluate(
results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
num_repetitions=2,
)

assert len(results) == 2
for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
dataset_id=dataset.id,
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(uploads=[example])

def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
# This target intentionally does not take attachments
return {"answer": "test image"}

def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
client=langchain_client,
num_repetitions=2,
)

# 6. Verify results
assert len(results) == 2
for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

# Cleanup
langchain_client.delete_dataset(dataset_name=dataset_name)


@@ -1353,6 +1403,157 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]:
langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

async def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = await langchain_client.aevaluate(
target, data=dataset_name, evaluators=[evaluator], num_repetitions=2
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_attachments_not_in_target(
langchain_client: Client,
) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUploadWithAttachments(
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
# This target intentionally does not take attachments
return {"answer": "test image"}

async def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = await langchain_client.aevaluate(
target, data=dataset_name, evaluators=[evaluator], num_repetitions=2
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None:
"""Test evaluating examples without attachments using a target with attachments."""
dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals without attachments",
data_type=DataType.kv,
)

# Create example using old way, attachments should be set to {}
langchain_client.create_example(
dataset_id=dataset.id,
inputs={"question": "What is 2+2?"},
outputs={"answer": "4"},
)

# Verify we can create example the new way without attachments
example = ExampleUploadWithAttachments(
inputs={"question": "What is 3+1?"},
outputs={"answer": "4"},
)
langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive an empty attachments dict
assert isinstance(attachments, dict)
assert len(attachments) == 0
return {"answer": "4"}

async def evaluator(run: Run, example: Example) -> Dict[str, Any]:
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
)
}

results = await aevaluate(
target, data=dataset_name, evaluators=[evaluator], client=langchain_client
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


def test_examples_length_validation(langchain_client: Client) -> None:
"""Test that mismatched lengths raise ValueError for create and update examples."""
dataset_name = "__test_examples_length_validation" + uuid4().hex[:4]