From 82383fe3a6b96551e78dd8560e15c396b74f6011 Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:55:25 -0800 Subject: [PATCH] [python] support uploading examples with attachments and running evals on examples with attachments (#1209) Co-authored-by: Ankush Gola <9536492+agola11@users.noreply.github.com> Co-authored-by: Bagatur Co-authored-by: Jake Rachleff Co-authored-by: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> Co-authored-by: Ankush Gola --- ...load_example_with_large_file_attachment.py | 111 +++ python/bench/upload_examples_bench.py | 143 +++ python/langsmith/client.py | 320 ++++++- python/langsmith/evaluation/_arunner.py | 80 +- python/langsmith/evaluation/_runner.py | 166 +++- python/langsmith/evaluation/evaluator.py | 11 +- python/langsmith/schemas.py | 85 +- python/pyproject.toml | 2 +- python/tests/integration_tests/test_client.py | 838 +++++++++++++++++- .../unit_tests/evaluation/test_runner.py | 307 ++++++- python/tests/unit_tests/test_client.py | 91 ++ 11 files changed, 2107 insertions(+), 47 deletions(-) create mode 100644 python/bench/upload_example_with_large_file_attachment.py create mode 100644 python/bench/upload_examples_bench.py diff --git a/python/bench/upload_example_with_large_file_attachment.py b/python/bench/upload_example_with_large_file_attachment.py new file mode 100644 index 000000000..8aaedd696 --- /dev/null +++ b/python/bench/upload_example_with_large_file_attachment.py @@ -0,0 +1,111 @@ +import os +import statistics +import time +from pathlib import Path +from typing import Dict + +from langsmith import Client +from langsmith.schemas import ExampleUpsertWithAttachments + +WRITE_BATCH = 10000 + + +def create_large_file(size: int, dir: str) -> str: + """Create a large file for benchmarking purposes.""" + filename = f"large_file_{size}.txt" + filepath = os.path.join(dir, filename) + + # delete the file if it exists + print("Deleting existing file...") + if os.path.exists(filepath): + os.remove(filepath) + + print("Creating big file...") + with open(filepath, "w") as f: + curr_size = 0 + while curr_size < size: + f.write("a" * (size - curr_size)) + curr_size += size - curr_size + + print("Done creating big file...") + return filepath + + +DATASET_NAME = "upsert_big_file_to_dataset" + + +def benchmark_big_file_upload( + size_bytes: int, num_examples: int, samples: int = 1 +) -> Dict: + """ + Benchmark run creation with specified parameters. + Returns timing statistics. 
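+    Each sample recreates the dataset and times Client.upsert_examples_multipart
+    for num_examples examples that each attach the same large file by path.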
+ """ + multipart_timings = [] + + for _ in range(samples): + client = Client() + + if client.has_dataset(dataset_name=DATASET_NAME): + client.delete_dataset(dataset_name=DATASET_NAME) + + dataset = client.create_dataset( + DATASET_NAME, + description="Test dataset for big file upload", + ) + large_file = create_large_file(size_bytes, "/tmp") + examples = [ + ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"a": 1}, + outputs={"b": 2}, + attachments={ + "bigfile": ("text/plain", Path(large_file)), + }, + ) + for _ in range(num_examples) + ] + + multipart_start = time.perf_counter() + client.upsert_examples_multipart(upserts=examples) + multipart_elapsed = time.perf_counter() - multipart_start + + multipart_timings.append(multipart_elapsed) + + return { + "mean": statistics.mean(multipart_timings), + "median": statistics.median(multipart_timings), + "stdev": ( + statistics.stdev(multipart_timings) if len(multipart_timings) > 1 else 0 + ), + "min": min(multipart_timings), + "max": max(multipart_timings), + } + + +size_bytes = 50000000 +num_examples = 10 + + +def main(size_bytes: int, num_examples: int = 1): + """ + Run benchmarks with different combinations of parameters and report results. + """ + results = benchmark_big_file_upload(size_bytes, num_examples) + + print(f"\nBenchmark Results for size {size_bytes} and {num_examples} examples:") + print("-" * 30) + print(f"{'Metric':<15} {'Result':>20}") + print("-" * 30) + + metrics = ["mean", "median", "stdev", "min", "max"] + for metric in metrics: + print(f"{results[metric]:>20.4f}") + + print("-" * 30) + print(f"{'Throughput':<15} {num_examples / results['mean']:>20.2f} ") + print("(examples/second)") + + +if __name__ == "__main__": + main(size_bytes, num_examples) diff --git a/python/bench/upload_examples_bench.py b/python/bench/upload_examples_bench.py new file mode 100644 index 000000000..5a22a731b --- /dev/null +++ b/python/bench/upload_examples_bench.py @@ -0,0 +1,143 @@ +import statistics +import time +from typing import Dict +from uuid import uuid4 + +from langsmith import Client +from langsmith.schemas import DataType, ExampleUpsertWithAttachments + + +def create_large_json(length: int) -> Dict: + """Create a large JSON object for benchmarking purposes.""" + large_array = [ + { + "index": i, + "data": f"This is element number {i}", + "nested": {"id": i, "value": f"Nested value for element {i}"}, + } + for i in range(length) + ] + + return { + "name": "Huge JSON" + str(uuid4()), + "description": "This is a very large JSON object for benchmarking purposes.", + "array": large_array, + "metadata": { + "created_at": "2024-10-22T19:00:00Z", + "author": "Python Program", + "version": 1.0, + }, + } + + +def create_example_data(dataset_id: str, json_size: int) -> Dict: + """Create a single example data object.""" + return ExampleUpsertWithAttachments( + **{ + "dataset_id": dataset_id, + "inputs": create_large_json(json_size), + "outputs": create_large_json(json_size), + } + ) + + +DATASET_NAME = "upsert_llm_evaluator_benchmark_dataset" + + +def benchmark_example_uploading( + num_examples: int, json_size: int, samples: int = 1 +) -> Dict: + """ + Benchmark run creation with specified parameters. + Returns timing statistics. 
+ """ + multipart_timings, old_timings = [], [] + + for _ in range(samples): + client = Client() + + if client.has_dataset(dataset_name=DATASET_NAME): + client.delete_dataset(dataset_name=DATASET_NAME) + + dataset = client.create_dataset( + DATASET_NAME, + description="Test dataset for multipart example upload", + data_type=DataType.kv, + ) + examples = [ + create_example_data(dataset.id, json_size) for i in range(num_examples) + ] + + # Old method + old_start = time.perf_counter() + # inputs = [e.inputs for e in examples] + # outputs = [e.outputs for e in examples] + # # the create_examples endpoint fails above 20mb + # # so this will crash with json_size > ~100 + # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + old_elapsed = time.perf_counter() - old_start + + # New method + multipart_start = time.perf_counter() + client.upsert_examples_multipart(upserts=examples) + multipart_elapsed = time.perf_counter() - multipart_start + + multipart_timings.append(multipart_elapsed) + old_timings.append(old_elapsed) + + return { + "old": { + "mean": statistics.mean(old_timings), + "median": statistics.median(old_timings), + "stdev": statistics.stdev(old_timings) if len(old_timings) > 1 else 0, + "min": min(old_timings), + "max": max(old_timings), + }, + "new": { + "mean": statistics.mean(multipart_timings), + "median": statistics.median(multipart_timings), + "stdev": ( + statistics.stdev(multipart_timings) if len(multipart_timings) > 1 else 0 + ), + "min": min(multipart_timings), + "max": max(multipart_timings), + }, + } + + +json_size = 1000 +num_examples = 1000 + + +def main(json_size: int, num_examples: int): + """ + Run benchmarks with different combinations of parameters and report results. + """ + results = benchmark_example_uploading( + num_examples=num_examples, json_size=json_size + ) + + print( + f"\nBenchmark Results for {num_examples} examples with JSON size {json_size}:" + ) + print("-" * 60) + print(f"{'Metric':<15} {'Old Method':>20} {'New Method':>20}") + print("-" * 60) + + metrics = ["mean", "median", "stdev", "min", "max"] + for metric in metrics: + print( + f"{metric:<15} {results['old'][metric]:>20.4f} " + f"{results['new'][metric]:>20.4f}" + ) + + print("-" * 60) + print( + f"{'Throughput':<15} {num_examples / results['old']['mean']:>20.2f} " + f"{num_examples / results['new']['mean']:>20.2f}" + ) + print("(examples/second)") + + +if __name__ == "__main__": + main(json_size, num_examples) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index e422dbec1..a92a89659 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -34,6 +34,7 @@ import warnings import weakref from inspect import signature +from pathlib import Path from queue import PriorityQueue from typing import ( TYPE_CHECKING, @@ -83,6 +84,7 @@ _SIZE_LIMIT_BYTES, ) from langsmith._internal._multipart import ( + MultipartPart, MultipartPartsAndContext, join_multipart_parts_and_context, ) @@ -946,7 +948,6 @@ def _get_paginated_list( params=params_, ) items = response.json() - if not items: break yield from items @@ -1682,9 +1683,7 @@ def update_run( events: Optional[Sequence[dict]] = None, extra: Optional[Dict] = None, tags: Optional[List[str]] = None, - attachments: Optional[ - Dict[str, tuple[str, bytes] | ls_schemas.Attachment] - ] = None, + attachments: Optional[ls_schemas.Attachments] = None, **kwargs: Any, ) -> None: """Update a run in the LangSmith API. 
@@ -3463,6 +3462,262 @@ def create_example_from_run( created_at=created_at, ) + def _prepare_multipart_data( + self, + examples: Union[ + List[ls_schemas.ExampleUploadWithAttachments] + | List[ls_schemas.ExampleUpsertWithAttachments] + | List[ls_schemas.ExampleUpdateWithAttachments], + ], + include_dataset_id: bool = False, + ) -> Tuple[Any, bytes]: + parts: List[MultipartPart] = [] + if include_dataset_id: + if not isinstance(examples[0], ls_schemas.ExampleUpsertWithAttachments): + raise ValueError( + "The examples must be of type ExampleUpsertWithAttachments" + " if include_dataset_id is True" + ) + dataset_id = examples[0].dataset_id + + for example in examples: + if ( + not isinstance(example, ls_schemas.ExampleUploadWithAttachments) + and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments) + and not isinstance(example, ls_schemas.ExampleUpdateWithAttachments) + ): + raise ValueError( + "The examples must be of type ExampleUploadWithAttachments" + " or ExampleUpsertWithAttachments" + " or ExampleUpdateWithAttachments" + ) + if example.id is not None: + example_id = str(example.id) + else: + example_id = str(uuid.uuid4()) + + if isinstance(example, ls_schemas.ExampleUpdateWithAttachments): + created_at = None + else: + created_at = example.created_at + + example_body = { + **({"dataset_id": dataset_id} if include_dataset_id else {}), + **({"created_at": created_at} if created_at is not None else {}), + } + if example.metadata is not None: + example_body["metadata"] = example.metadata + if example.split is not None: + example_body["split"] = example.split + valb = _dumps_json(example_body) + + parts.append( + ( + f"{example_id}", + ( + None, + valb, + "application/json", + {}, + ), + ) + ) + + inputsb = _dumps_json(example.inputs) + + parts.append( + ( + f"{example_id}.inputs", + ( + None, + inputsb, + "application/json", + {}, + ), + ) + ) + + if example.outputs: + outputsb = _dumps_json(example.outputs) + parts.append( + ( + f"{example_id}.outputs", + ( + None, + outputsb, + "application/json", + {}, + ), + ) + ) + + if example.attachments: + for name, attachment in example.attachments.items(): + if isinstance(attachment, tuple): + if isinstance(attachment[1], Path): + mime_type, file_path = attachment + file_size = os.path.getsize(file_path) + parts.append( + ( + f"{example_id}.attachment.{name}", + ( + None, + open(file_path, "rb"), # type: ignore[arg-type] + f"{mime_type}; length={file_size}", + {}, + ), + ) + ) + else: + mime_type, data = attachment + parts.append( + ( + f"{example_id}.attachment.{name}", + ( + None, + data, + f"{mime_type}; length={len(data)}", + {}, + ), + ) + ) + else: + parts.append( + ( + f"{example_id}.attachment.{name}", + ( + None, + attachment.data, + f"{attachment.mime_type}; length={len(attachment.data)}", + {}, + ), + ) + ) + + if ( + isinstance(example, ls_schemas.ExampleUpdateWithAttachments) + and example.attachments_operations + ): + attachments_operationsb = _dumps_json(example.attachments_operations) + parts.append( + ( + f"{example_id}.attachments_operations", + ( + None, + attachments_operationsb, + "application/json", + {}, + ), + ) + ) + + encoder = rqtb_multipart.MultipartEncoder(parts, boundary=BOUNDARY) + if encoder.len <= 20_000_000: # ~20 MB + data = encoder.to_string() + else: + data = encoder + + return encoder, data + + def update_examples_multipart( + self, + *, + dataset_id: ID_TYPE, + updates: Optional[List[ls_schemas.ExampleUpdateWithAttachments]] = None, + ) -> ls_schemas.UpsertExamplesResponse: + """Upload 
examples.""" + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the multipart examples endpoint, please update to the latest version." + ) + if updates is None: + updates = [] + + encoder, data = self._prepare_multipart_data(updates, include_dataset_id=False) + + response = self.request_with_retries( + "PATCH", + f"/v1/platform/datasets/{dataset_id}/examples", + request_kwargs={ + "data": data, + "headers": { + **self._headers, + "Content-Type": encoder.content_type, + }, + }, + ) + ls_utils.raise_for_status_with_text(response) + return response.json() + + def upload_examples_multipart( + self, + *, + dataset_id: ID_TYPE, + uploads: Optional[List[ls_schemas.ExampleUploadWithAttachments]] = None, + ) -> ls_schemas.UpsertExamplesResponse: + """Upload examples.""" + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the multipart examples endpoint, please update to the latest version." + ) + if uploads is None: + uploads = [] + encoder, data = self._prepare_multipart_data(uploads, include_dataset_id=False) + + response = self.request_with_retries( + "POST", + f"/v1/platform/datasets/{dataset_id}/examples", + request_kwargs={ + "data": data, + "headers": { + **self._headers, + "Content-Type": encoder.content_type, + }, + }, + ) + ls_utils.raise_for_status_with_text(response) + return response.json() + + def upsert_examples_multipart( + self, + *, + upserts: Optional[List[ls_schemas.ExampleUpsertWithAttachments]] = None, + ) -> ls_schemas.UpsertExamplesResponse: + """Upsert examples. + + .. deprecated:: 0.1.0 + This method is deprecated. Use :func:`langsmith.upload_examples_multipart` instead. + + """ # noqa: E501 + if not (self.info.instance_flags or {}).get( + "examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the multipart examples endpoint, please update to the latest version." + ) + if upserts is None: + upserts = [] + + encoder, data = self._prepare_multipart_data(upserts, include_dataset_id=True) + + response = self.request_with_retries( + "POST", + "/v1/platform/examples/multipart", + request_kwargs={ + "data": data, + "headers": { + **self._headers, + "Content-Type": encoder.content_type, + }, + }, + ) + ls_utils.raise_for_status_with_text(response) + return response.json() + def create_examples( self, *, @@ -3637,8 +3892,22 @@ def read_example( "as_of": as_of.isoformat() if as_of else None, }, ) + + example = response.json() + attachments = {} + if example["attachment_urls"]: + for key, value in example["attachment_urls"].items(): + response = requests.get(value["presigned_url"], stream=True) + response.raise_for_status() + reader = io.BytesIO(response.content) + attachments[key.split(".")[1]] = { + "presigned_url": value["presigned_url"], + "reader": reader, + } + return ls_schemas.Example( - **response.json(), + **{k: v for k, v in example.items() if k != "attachment_urls"}, + attachments=attachments, _host_url=self._host_url, _tenant_id=self._get_optional_tenant_id(), ) @@ -3656,6 +3925,7 @@ def list_examples( limit: Optional[int] = None, metadata: Optional[dict] = None, filter: Optional[str] = None, + include_attachments: bool = False, **kwargs: Any, ) -> Iterator[ls_schemas.Example]: """Retrieve the example rows of the specified dataset. 
@@ -3705,11 +3975,25 @@ def list_examples( params["dataset"] = dataset_id else: pass + if include_attachments: + params["select"] = ["attachment_urls", "outputs", "metadata"] for i, example in enumerate( self._get_paginated_list("/examples", params=params) ): + attachments = {} + if example["attachment_urls"]: + for key, value in example["attachment_urls"].items(): + response = requests.get(value["presigned_url"], stream=True) + response.raise_for_status() + reader = io.BytesIO(response.content) + attachments[key.split(".")[1]] = { + "presigned_url": value["presigned_url"], + "reader": reader, + } + yield ls_schemas.Example( - **example, + **{k: v for k, v in example.items() if k != "attachment_urls"}, + attachments=attachments, _host_url=self._host_url, _tenant_id=self._get_optional_tenant_id(), ) @@ -3849,6 +4133,7 @@ def update_example( metadata: Optional[Dict] = None, split: Optional[str | List[str]] = None, dataset_id: Optional[ID_TYPE] = None, + attachments_operations: Optional[ls_schemas.AttachmentsOperations] = None, ) -> Dict[str, Any]: """Update a specific example. @@ -3873,12 +4158,20 @@ def update_example( Dict[str, Any] The updated example. """ + if attachments_operations is not None: + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the attachment operations, please update to the latest version." + ) example = dict( inputs=inputs, outputs=outputs, dataset_id=dataset_id, metadata=metadata, split=split, + attachments_operations=attachments_operations, ) response = self.request_with_retries( "PATCH", @@ -3898,6 +4191,9 @@ def update_examples( metadata: Optional[Sequence[Optional[Dict]]] = None, splits: Optional[Sequence[Optional[str | List[str]]]] = None, dataset_ids: Optional[Sequence[Optional[ID_TYPE]]] = None, + attachments_operations: Optional[ + Sequence[Optional[ls_schemas.AttachmentsOperations]] + ] = None, ) -> Dict[str, Any]: """Update multiple examples. @@ -3922,12 +4218,20 @@ def update_examples( Dict[str, Any] The response from the server (specifies the number of examples updated). """ + if attachments_operations is not None: + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the attachment operations, please update to the latest version." 
+ ) sequence_args = { "inputs": inputs, "outputs": outputs, "metadata": metadata, "splits": splits, "dataset_ids": dataset_ids, + "attachments_operations": attachments_operations, } # Since inputs are required, we will check against them examples_len = len(example_ids) @@ -3945,14 +4249,16 @@ def update_examples( "dataset_id": dataset_id_, "metadata": metadata_, "split": split_, + "attachments_operations": attachments_operations_, } - for id_, in_, out_, metadata_, split_, dataset_id_ in zip( + for id_, in_, out_, metadata_, split_, dataset_id_, attachments_operations_ in zip( example_ids, inputs or [None] * len(example_ids), outputs or [None] * len(example_ids), metadata or [None] * len(example_ids), splits or [None] * len(example_ids), dataset_ids or [None] * len(example_ids), + attachments_operations or [None] * len(example_ids), ) ] response = self.request_with_retries( diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 4d5c063f6..25ea0d62a 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -37,14 +37,17 @@ DATA_T, EVALUATOR_T, ExperimentResultRow, + _evaluators_include_attachments, _ExperimentManagerMixin, _extract_feedback_keys, _ForwardResults, + _include_attachments, _is_langchain_runnable, _load_examples_map, _load_experiment, _load_tqdm, _load_traces, + _make_fresh_examples, _resolve_data, _resolve_evaluators, _resolve_experiment, @@ -68,7 +71,9 @@ logger = logging.getLogger(__name__) -ATARGET_T = Callable[[dict], Awaitable[dict]] +ATARGET_T = Union[ + Callable[[dict], Awaitable[dict]], Callable[[dict, dict], Awaitable[dict]] +] async def aevaluate( @@ -256,6 +261,7 @@ async def aevaluate( ... ) # doctest: +ELLIPSIS View the evaluation results for experiment:... + .. 
versionchanged:: 0.2.0 'max_concurrency' default updated from None (no limit on concurrency) @@ -473,6 +479,8 @@ async def _aevaluate( description=description, num_repetitions=num_repetitions, runs=runs, + include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).astart() cache_dir = ls_utils.get_cache_dir(None) @@ -534,6 +542,7 @@ def __init__( summary_results: Optional[AsyncIterable[EvaluationResults]] = None, description: Optional[str] = None, num_repetitions: int = 1, + include_attachments: bool = False, upload_results: bool = True, ): super().__init__( @@ -550,14 +559,23 @@ def __init__( self._evaluation_results = evaluation_results self._summary_results = summary_results self._num_repetitions = num_repetitions + self._include_attachments = include_attachments self._upload_results = upload_results async def aget_examples(self) -> AsyncIterator[schemas.Example]: if self._examples is None: - self._examples = _aresolve_data(self._data, client=self.client) + self._examples = _aresolve_data( + self._data, + client=self.client, + include_attachments=self._include_attachments, + ) if self._num_repetitions > 1: + examples_list = [example async for example in self._examples] self._examples = async_chain_from_iterable( - aitertools.atee(self._examples, self._num_repetitions) + [ + async_iter_from_list(_make_fresh_examples(examples_list)) + for _ in range(self._num_repetitions) + ] ) self._examples, examples_iter = aitertools.atee( @@ -620,6 +638,7 @@ async def astart(self) -> _AsyncExperimentManager: client=self.client, runs=self._runs, evaluation_results=self._evaluation_results, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -629,7 +648,11 @@ async def awith_predictions( /, max_concurrency: Optional[int] = None, ) -> _AsyncExperimentManager: - _experiment_results = self._apredict(target, max_concurrency=max_concurrency) + _experiment_results = self._apredict( + target, + max_concurrency=max_concurrency, + include_attachments=_include_attachments(target), + ) r1, r2 = aitertools.atee(_experiment_results, 2, lock=asyncio.Lock()) return _AsyncExperimentManager( (pred["example"] async for pred in r1), @@ -637,6 +660,7 @@ async def awith_predictions( metadata=self._metadata, client=self.client, runs=(pred["run"] async for pred in r2), + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -657,6 +681,7 @@ async def awith_evaluators( runs=(result["run"] async for result in r2), evaluation_results=(result["evaluation_results"] async for result in r3), summary_results=self._summary_results, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -674,6 +699,7 @@ async def awith_summary_evaluators( runs=self.aget_runs(), evaluation_results=self._evaluation_results, summary_results=aggregate_feedback_gen, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -701,7 +727,11 @@ async def aget_summary_scores(self) -> Dict[str, List[dict]]: ## Private methods async def _apredict( - self, target: ATARGET_T, /, max_concurrency: Optional[int] = None + self, + target: ATARGET_T, + /, + max_concurrency: Optional[int] = None, + include_attachments: bool = False, ) -> AsyncIterator[_ForwardResults]: fn = _ensure_async_traceable(target) @@ -709,7 +739,12 @@ async def predict_all(): async for example in await self.aget_examples(): # Yield the coroutine to be awaited later yield _aforward( - 
fn, example, self.experiment_name, self._metadata, self.client + fn, + example, + self.experiment_name, + self._metadata, + self.client, + include_attachments, ) async for result in aitertools.aiter_with_concurrency( @@ -993,6 +1028,7 @@ async def _aforward( experiment_name: str, metadata: dict, client: langsmith.Client, + include_attachments: bool = False, ) -> _ForwardResults: run: Optional[schemas.RunBase] = None @@ -1002,8 +1038,13 @@ def _get_run(r: run_trees.RunTree) -> None: with rh.tracing_context(enabled=True): try: + args = ( + (example.inputs, example.attachments) + if include_attachments + else (example.inputs,) + ) await fn( - example.inputs, + *args, langsmith_extra=rh.LangSmithExtra( reference_example_id=example.id, on_end=_get_run, @@ -1019,6 +1060,10 @@ def _get_run(r: run_trees.RunTree) -> None: client=client, ), ) + if include_attachments and example.attachments is not None: + for attachment in example.attachments: + reader = example.attachments[attachment]["reader"] + reader.seek(0) except Exception as e: logger.error( f"Error running target function: {e}", exc_info=True, stacklevel=1 @@ -1055,17 +1100,22 @@ def _ensure_async_traceable( return target # type: ignore else: if _is_langchain_runnable(target): - target = target.ainvoke # type: ignore[attr-defined] - return rh.traceable(name="AsyncTarget")(target) + target = target.ainvoke # type: ignore[union-attr] + return rh.traceable(name="AsyncTarget")(target) # type: ignore[arg-type] def _aresolve_data( - data: Union[DATA_T, AsyncIterable[schemas.Example]], *, client: langsmith.Client + data: Union[DATA_T, AsyncIterable[schemas.Example]], + *, + client: langsmith.Client, + include_attachments: bool = False, ) -> AsyncIterator[schemas.Example]: """Return the examples for the given dataset.""" if isinstance(data, AsyncIterable): return aitertools.ensure_async_iterator(data) - return aitertools.ensure_async_iterator(_resolve_data(data, client=client)) + return aitertools.ensure_async_iterator( + _resolve_data(data, client=client, include_attachments=include_attachments) + ) T = TypeVar("T") @@ -1078,3 +1128,11 @@ async def async_chain_from_iterable( for sub_iterable in iterable: async for item in sub_iterable: yield item + + +async def async_iter_from_list( + examples: List[schemas.Example], +) -> AsyncIterable[schemas.Example]: + """Convert a list of examples to an async iterable.""" + for example in examples: + yield example diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index bf7505284..ea206b098 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -8,6 +8,7 @@ import datetime import functools import inspect +import io import itertools import logging import pathlib @@ -36,6 +37,7 @@ cast, ) +import requests from typing_extensions import TypedDict, overload import langsmith @@ -68,7 +70,7 @@ DataFrame = Any logger = logging.getLogger(__name__) -TARGET_T = Callable[[dict], dict] +TARGET_T = Union[Callable[[dict], dict], Callable[[dict, dict], dict]] # Data format: dataset-name, dataset_id, or examples DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example], schemas.Dataset] # Summary evaluator runs over the whole dataset @@ -1064,6 +1066,8 @@ def _evaluate( # If provided, we don't need to create a new experiment. runs=runs, # Create or resolve the experiment. 
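+        # Only fetch attachment data when the target or one of the evaluators
+        # declares an `attachments` parameter (see _include_attachments and
+        # _evaluators_include_attachments defined later in this file).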
+ include_attachments=_include_attachments(target) + or _evaluators_include_attachments(evaluators), upload_results=upload_results, ).start() cache_dir = ls_utils.get_cache_dir(None) @@ -1312,6 +1316,7 @@ def __init__( summary_results: Optional[Iterable[EvaluationResults]] = None, description: Optional[str] = None, num_repetitions: int = 1, + include_attachments: bool = False, upload_results: bool = True, ): super().__init__( @@ -1326,15 +1331,22 @@ def __init__( self._evaluation_results = evaluation_results self._summary_results = summary_results self._num_repetitions = num_repetitions + self._include_attachments = include_attachments self._upload_results = upload_results @property def examples(self) -> Iterable[schemas.Example]: if self._examples is None: - self._examples = _resolve_data(self._data, client=self.client) + self._examples = _resolve_data( + self._data, + client=self.client, + include_attachments=self._include_attachments, + ) if self._num_repetitions > 1: + examples_list = list(self._examples) self._examples = itertools.chain.from_iterable( - itertools.tee(self._examples, self._num_repetitions) + _make_fresh_examples(examples_list) + for _ in range(self._num_repetitions) ) self._examples, examples_iter = itertools.tee(self._examples) return examples_iter @@ -1377,6 +1389,7 @@ def start(self) -> _ExperimentManager: client=self.client, runs=self._runs, evaluation_results=self._evaluation_results, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -1389,7 +1402,10 @@ def with_predictions( """Lazily apply the target function to the experiment.""" context = copy_context() _experiment_results = context.run( - self._predict, target, max_concurrency=max_concurrency + self._predict, + target, + max_concurrency=max_concurrency, + include_attachments=_include_attachments(target), ) r1, r2 = itertools.tee(_experiment_results, 2) return _ExperimentManager( @@ -1400,6 +1416,7 @@ def with_predictions( runs=(pred["run"] for pred in r2), upload_results=self._upload_results, # TODO: Can't do multiple prediction rounds rn. 
+ include_attachments=self._include_attachments, ) def with_evaluators( @@ -1430,6 +1447,7 @@ def with_evaluators( runs=(result["run"] for result in r2), evaluation_results=(result["evaluation_results"] for result in r3), summary_results=self._summary_results, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -1451,6 +1469,7 @@ def with_summary_evaluators( runs=self.runs, evaluation_results=self._evaluation_results, summary_results=aggregate_feedback_gen, + include_attachments=self._include_attachments, upload_results=self._upload_results, ) @@ -1481,10 +1500,15 @@ def get_summary_scores(self) -> Dict[str, List[dict]]: # Private methods def _predict( - self, target: TARGET_T, /, max_concurrency: Optional[int] = None + self, + target: TARGET_T, + /, + max_concurrency: Optional[int] = None, + include_attachments: bool = False, ) -> Generator[_ForwardResults, None, None]: """Run the target function on the examples.""" fn = _ensure_traceable(target) + if max_concurrency == 0: for example in self.examples: yield _forward( @@ -1494,6 +1518,7 @@ def _predict( self._metadata, self.client, self._upload_results, + include_attachments, ) else: @@ -1507,6 +1532,7 @@ def _predict( self._metadata, self.client, self._upload_results, + include_attachments, ) for example in self.examples ] @@ -1794,6 +1820,7 @@ def _forward( metadata: dict, client: langsmith.Client, upload_results: bool, + include_attachments: bool = False, ) -> _ForwardResults: run: Optional[schemas.RunBase] = None @@ -1815,7 +1842,19 @@ def _get_run(r: rt.RunTree) -> None: client=client, ) try: - fn(example.inputs, langsmith_extra=langsmith_extra) + args = ( + (example.inputs, example.attachments) + if include_attachments + else (example.inputs,) + ) + fn( + *args, + langsmith_extra=langsmith_extra, + ) + if include_attachments and example.attachments is not None: + for attachment in example.attachments: + reader = example.attachments[attachment]["reader"] + reader.seek(0) except Exception as e: logger.error( f"Error running target function: {e}", exc_info=True, stacklevel=1 @@ -1832,17 +1871,28 @@ def _is_valid_uuid(value: str) -> bool: def _resolve_data( - data: DATA_T, *, client: langsmith.Client + data: DATA_T, + *, + client: langsmith.Client, + include_attachments: bool = False, ) -> Iterable[schemas.Example]: """Return the examples for the given dataset.""" if isinstance(data, uuid.UUID): - return client.list_examples(dataset_id=data) + return client.list_examples( + dataset_id=data, include_attachments=include_attachments + ) elif isinstance(data, str) and _is_valid_uuid(data): - return client.list_examples(dataset_id=uuid.UUID(data)) + return client.list_examples( + dataset_id=uuid.UUID(data), include_attachments=include_attachments + ) elif isinstance(data, str): - return client.list_examples(dataset_name=data) + return client.list_examples( + dataset_name=data, include_attachments=include_attachments + ) elif isinstance(data, schemas.Dataset): - return client.list_examples(dataset_id=data.id) + return client.list_examples( + dataset_id=data.id, include_attachments=include_attachments + ) return data @@ -1862,6 +1912,7 @@ def _ensure_traceable( " ...\n" ")" ) + if rh.is_traceable_function(target): fn: rh.SupportsLangsmithExtra[[dict], dict] = target else: @@ -1871,6 +1922,60 @@ def _ensure_traceable( return fn +def _evaluators_include_attachments( + evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]], +) -> bool: + if evaluators is None: + return False + + def 
evaluator_has_attachments(evaluator: Any) -> bool: + if not callable(evaluator): + return False + sig = inspect.signature(evaluator) + params = list(sig.parameters.values()) + positional_params = [ + p for p in params if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + ] + return any(p.name == "attachments" for p in positional_params) + + return any(evaluator_has_attachments(e) for e in evaluators) + + +def _include_attachments( + target: Any, +) -> bool: + """Whether the target function accepts attachments.""" + if _is_langchain_runnable(target) or not callable(target): + return False + # Check function signature + sig = inspect.signature(target) + params = list(sig.parameters.values()) + positional_params = [ + p for p in params if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + ] + positional_no_default = [p for p in positional_params if p.default is p.empty] + + if len(positional_params) == 0: + raise ValueError( + "Target function must accept at least one positional argument (inputs)." + ) + elif len(positional_no_default) > 2: + raise ValueError( + "Target function must accept at most two " + "arguments without default values: (inputs, attachments)." + ) + elif len(positional_no_default) == 2: + if [p.name for p in positional_no_default] != ["inputs", "attachments"]: + raise ValueError( + "When passing 2 positional arguments, they must be named " + "'inputs' and 'attachments', respectively. Received: " + f"{[p.name for p in positional_no_default]}" + ) + return True + else: + return [p.name for p in positional_params[:2]] == ["inputs", "attachments"] + + def _resolve_experiment( experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]], runs: Optional[Iterable[schemas.Run]], @@ -2122,3 +2227,42 @@ def _import_langchain_runnable() -> Optional[type]: def _is_langchain_runnable(o: Any) -> bool: return bool((Runnable := _import_langchain_runnable()) and isinstance(o, Runnable)) + + +def _reset_example_attachments(example: schemas.Example) -> schemas.Example: + """Reset attachment readers for an example.""" + if not hasattr(example, "attachments") or not example.attachments: + return example + + new_attachments = {} + for key, attachment in example.attachments.items(): + response = requests.get(attachment["presigned_url"], stream=True) + response.raise_for_status() + reader = io.BytesIO(response.content) + new_attachments[key] = { + "presigned_url": attachment["presigned_url"], + "reader": reader, + } + + # Create a new Example instance with the updated attachments + return schemas.Example( + id=example.id, + created_at=example.created_at, + dataset_id=example.dataset_id, + inputs=example.inputs, + outputs=example.outputs, + metadata=example.metadata, + modified_at=example.modified_at, + runs=example.runs, + source_run_id=example.source_run_id, + attachments=new_attachments, + _host_url=example._host_url, + _tenant_id=example._tenant_id, + ) + + +def _make_fresh_examples( + _original_examples: List[schemas.Example], +) -> List[schemas.Example]: + """Create fresh copies of examples with reset readers.""" + return [_reset_example_attachments(example) for example in _original_examples] diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 02fab3b71..a1505699a 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -624,7 +624,14 @@ def _normalize_evaluator_func( Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], 
Awaitable[_RUNNABLE_OUTPUT]], ]: - supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + supported_args = ( + "run", + "example", + "inputs", + "outputs", + "reference_outputs", + "attachments", + ) sig = inspect.signature(func) positional_args = [ pname @@ -659,6 +666,7 @@ async def awrapper( "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, + "attachments": example.attachments or {} if example else {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) @@ -679,6 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: "example": example, "inputs": example.inputs if example else {}, "outputs": run.outputs or {}, + "attachments": example.attachments or {}, "reference_outputs": example.outputs or {} if example else {}, } args = (arg_map[arg] for arg in positional_args) diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 7caca8f7b..acedaf177 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -64,7 +64,25 @@ def my_function(bar: int, my_val: Attachment): Attachments = Dict[str, Union[Tuple[str, bytes], Attachment]] -"""Attachments associated with the run. Each entry is a tuple of (mime_type, bytes).""" +"""Attachments associated with the run. +Each entry is a tuple of (mime_type, bytes), or (mime_type, file_path)""" + + +@runtime_checkable +class BinaryIOLike(Protocol): + """Protocol for binary IO-like objects.""" + + def read(self, size: int = -1) -> bytes: + """Read function.""" + ... + + def write(self, b: bytes) -> int: + """Write function.""" + ... + + def seek(self, offset: int, whence: int = 0) -> int: + """Seek function.""" + ... class ExampleBase(BaseModel): @@ -79,6 +97,7 @@ class Config: """Configuration class for the schema.""" frozen = True + arbitrary_types_allowed = True class ExampleCreate(ExampleBase): @@ -89,6 +108,32 @@ class ExampleCreate(ExampleBase): split: Optional[Union[str, List[str]]] = None +class ExampleUploadWithAttachments(BaseModel): + """Example upload with attachments.""" + + id: Optional[UUID] + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + inputs: Dict[str, Any] = Field(default_factory=dict) + outputs: Optional[Dict[str, Any]] = Field(default=None) + metadata: Optional[Dict[str, Any]] = Field(default=None) + split: Optional[Union[str, List[str]]] = None + attachments: Optional[Attachments] = None + + +class ExampleUpsertWithAttachments(ExampleUploadWithAttachments): + """Example create with attachments.""" + + dataset_id: UUID + + +class AttachmentInfo(TypedDict): + """Info for an attachment.""" + + presigned_url: str + reader: BinaryIOLike + # TODO: add mime type + + class Example(ExampleBase): """Example model.""" @@ -100,6 +145,9 @@ class Example(ExampleBase): modified_at: Optional[datetime] = Field(default=None) runs: List[Run] = Field(default_factory=list) source_run_id: Optional[UUID] = None + attachments: Optional[Dict[str, AttachmentInfo]] = Field(default=None) + """Dictionary with attachment names as keys and a tuple of the S3 url + and a reader of the data for the file.""" _host_url: Optional[str] = PrivateAttr(default=None) _tenant_id: Optional[UUID] = PrivateAttr(default=None) @@ -135,12 +183,24 @@ class ExampleSearch(ExampleBase): id: UUID +class AttachmentsOperations(BaseModel): + """Operations to perform on attachments.""" + + rename: Dict[str, str] = Field( + default_factory=dict, description="Mapping of old 
attachment names to new names" + ) + retain: List[str] = Field( + default_factory=list, description="List of attachment names to keep" + ) + + class ExampleUpdate(BaseModel): """Update class for Example.""" dataset_id: Optional[UUID] = None inputs: Optional[Dict[str, Any]] = None outputs: Optional[Dict[str, Any]] = None + attachments_operations: Optional[AttachmentsOperations] = None metadata: Optional[Dict[str, Any]] = None split: Optional[Union[str, List[str]]] = None @@ -150,6 +210,18 @@ class Config: frozen = True +class ExampleUpdateWithAttachments(ExampleUpdate): + """Example update with attachments.""" + + id: UUID + inputs: Dict[str, Any] = Field(default_factory=dict) + outputs: Optional[Dict[str, Any]] = Field(default=None) + metadata: Optional[Dict[str, Any]] = Field(default=None) + split: Optional[Union[str, List[str]]] = None + attachments: Optional[Attachments] = None + attachments_operations: Optional[AttachmentsOperations] = None + + class DataType(str, Enum): """Enum for dataset data types.""" @@ -718,6 +790,8 @@ class LangSmithInfo(BaseModel): license_expiration_time: Optional[datetime] = None """The time the license will expire.""" batch_ingest_config: Optional[BatchIngestConfig] = None + """The instance flags.""" + instance_flags: Optional[Dict[str, Any]] = None Example.update_forward_refs() @@ -1003,3 +1077,12 @@ class UsageMetadata(TypedDict): Does *not* need to sum to full output token count. Does *not* need to have all keys. """ + + +class UpsertExamplesResponse(TypedDict): + """Response object returned from the upsert_examples_multipart method.""" + + count: int + """The number of examples that were upserted.""" + example_ids: List[str] + """The ids of the examples that were upserted.""" diff --git a/python/pyproject.toml b/python/pyproject.toml index a831ff0df..5b008c34d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.2.1" +version = "0.2.2" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
authors = ["LangChain "] license = "MIT" diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 9bea700cd..f5f7ba878 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -20,10 +20,20 @@ from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor from langsmith.client import ID_TYPE, Client -from langsmith.schemas import DataType +from langsmith.evaluation import aevaluate, evaluate +from langsmith.schemas import ( + AttachmentsOperations, + DataType, + Example, + ExampleUpdateWithAttachments, + ExampleUploadWithAttachments, + ExampleUpsertWithAttachments, + Run, +) from langsmith.utils import ( LangSmithConnectionError, LangSmithError, + LangSmithNotFoundError, get_env_var, ) @@ -48,7 +58,14 @@ def wait_for( @pytest.fixture def langchain_client() -> Client: get_env_var.cache_clear() - return Client() + return Client( + info={ + "instance_flags": { + "dataset_examples_multipart_enabled": True, + "examples_multipart_enabled": True, + } + }, + ) def test_datasets(langchain_client: Client) -> None: @@ -369,6 +386,184 @@ def test_error_surfaced_invalid_uri(uri: str) -> None: client.create_run("My Run", inputs={"text": "hello world"}, run_type="llm") +def test_upload_examples_multipart(langchain_client: Client): + """Test uploading examples with attachments via multipart endpoint.""" + dataset_name = "__test_upload_examples_multipart" + uuid4().hex[:4] + if langchain_client.has_dataset(dataset_name=dataset_name): + langchain_client.delete_dataset(dataset_name=dataset_name) + + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for multipart example upload", + data_type=DataType.kv, + ) + + # Test example with all fields + example_id = uuid4() + example_1 = ExampleUploadWithAttachments( + id=example_id, + inputs={"text": "hello world"}, + attachments={ + "test_file": ("text/plain", b"test content"), + }, + ) + + # Test example with minimum required fields + example_2 = ExampleUploadWithAttachments( + inputs={"text": "minimal example"}, + ) + + # Test example with outputs and multiple attachments + example_3 = ExampleUploadWithAttachments( + inputs={"text": "example with outputs"}, + outputs={"response": "test response"}, + attachments={ + "file1": ("text/plain", b"content 1"), + "file2": ("text/plain", b"content 2"), + }, + ) + + # Test uploading multiple examples at once + created_examples = langchain_client.upload_examples_multipart( + dataset_id=dataset.id, uploads=[example_1, example_2, example_3] + ) + assert created_examples["count"] == 3 + + created_example_1 = langchain_client.read_example(example_id) + assert created_example_1.inputs["text"] == "hello world" + + # Verify the examples were created correctly + examples = [ + ex + for ex in langchain_client.list_examples( + dataset_id=dataset.id, + include_attachments=True, + ) + ] + assert len(examples) == 3 + + # Verify example with ID was created with correct ID + example_with_id = [ex for ex in examples if ex.id == example_id][0] + assert example_with_id.inputs["text"] == "hello world" + assert "test_file" in example_with_id.attachments + + # Verify example with outputs and multiple attachments + example_with_outputs = next( + ex + for ex in examples + if ex.outputs and ex.outputs.get("response") == "test response" + ) + assert len(example_with_outputs.attachments) == 2 + assert "file1" in example_with_outputs.attachments + assert "file2" in 
example_with_outputs.attachments + + # Test uploading to non-existent dataset fails + fake_id = uuid4() + with pytest.raises(LangSmithNotFoundError): + langchain_client.upload_examples_multipart( + dataset_id=fake_id, + uploads=[ + ExampleUploadWithAttachments( + inputs={"text": "should fail"}, + ) + ], + ) + + # Clean up + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_upsert_examples_multipart(langchain_client: Client) -> None: + """Test upserting examples with attachments via the multipart endpoint.""" + dataset_name = "__test_upsert_examples_multipart" + uuid4().hex[:4] + if langchain_client.has_dataset(dataset_name=dataset_name): + langchain_client.delete_dataset(dataset_name=dataset_name) + + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for multipart example upload", + data_type=DataType.kv, + ) + + # Test example with all fields + example_id = uuid4() + example_1 = ExampleUpsertWithAttachments( + id=example_id, + dataset_id=dataset.id, + inputs={"text": "hello world"}, + # test without outputs + attachments={ + "test_file": ("text/plain", b"test content"), + }, + ) + # Test example without id + example_2 = ExampleUpsertWithAttachments( + dataset_id=dataset.id, + inputs={"text": "foo bar"}, + outputs={"response": "baz"}, + attachments={ + "my_file": ("text/plain", b"more test content"), + }, + ) + created_examples = langchain_client.upsert_examples_multipart( + upserts=[example_1, example_2] + ) + assert created_examples["count"] == 2 + + created_example_1 = langchain_client.read_example(example_id) + assert created_example_1.inputs["text"] == "hello world" + assert created_example_1.outputs is None + + created_example_2 = langchain_client.read_example( + [id_ for id_ in created_examples["example_ids"] if id_ != str(example_id)][0] + ) + assert created_example_2.inputs["text"] == "foo bar" + assert created_example_2.outputs["response"] == "baz" + + # make sure examples were sent to the correct dataset + all_examples_in_dataset = [ + example for example in langchain_client.list_examples(dataset_id=dataset.id) + ] + assert len(all_examples_in_dataset) == 2 + + example_1_update = ExampleUpsertWithAttachments( + id=example_id, + dataset_id=dataset.id, + inputs={"text": "bar baz"}, + outputs={"response": "foo"}, + attachments={ + "my_file": ("text/plain", b"more test content"), + }, + ) + updated_examples = langchain_client.upsert_examples_multipart( + upserts=[example_1_update] + ) + assert updated_examples["count"] == 0 + # Test that adding invalid example fails + # even if valid examples are added alongside + example_3 = ExampleUpsertWithAttachments( + dataset_id=uuid4(), # not a real dataset + inputs={"text": "foo bar"}, + outputs={"response": "baz"}, + attachments={ + "my_file": ("text/plain", b"more test content"), + }, + ) + + with pytest.raises(LangSmithNotFoundError): + langchain_client.upsert_examples_multipart(upserts=[example_3]) + + all_examples_in_dataset = [ + example for example in langchain_client.list_examples(dataset_id=dataset.id) + ] + assert len(all_examples_in_dataset) == 2 + + # Throw type errors when not passing ExampleUpsertWithAttachments + with pytest.raises(ValueError): + langchain_client.upsert_examples_multipart(upserts=[{"foo": "bar"}]) + langchain_client.delete_dataset(dataset_name=dataset_name) + + def test_create_dataset(langchain_client: Client) -> None: dataset_name = "__test_create_dataset" + uuid4().hex[:4] if langchain_client.has_dataset(dataset_name=dataset_name): @@ -1020,6 
+1215,349 @@ def create_encoder(*args, **kwargs): assert not caplog.records +def test_list_examples_attachments_keys(langchain_client: Client) -> None: + """Test list_examples returns same keys with and without attachments.""" + dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset(dataset_name=dataset_name) + + langchain_client.upload_examples_multipart( + dataset_id=dataset.id, + uploads=[ + ExampleUploadWithAttachments( + inputs={"text": "hello world"}, + outputs={"response": "hi there"}, + attachments={ + "test_file": ("text/plain", b"test content"), + }, + ) + ], + ) + + # Get examples with attachments + with_attachments = next( + langchain_client.list_examples(dataset_id=dataset.id, include_attachments=True) + ) + + # Get examples without attachments + without_attachments = next( + langchain_client.list_examples(dataset_id=dataset.id, include_attachments=False) + ) + + with_keys = set(with_attachments.dict().keys()) + without_keys = set(without_attachments.dict().keys()) + assert with_keys == without_keys, ( + f"Keys differ when include_attachments=True vs False.\n" + f"Only in with_attachments: {with_keys - without_keys}\n" + f"Only in without_attachments: {without_keys - with_keys}" + ) + + langchain_client.delete_dataset(dataset_id=dataset.id) + + +def test_evaluate_with_attachments(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + + # 1. Create dataset + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + # 2. Create example with attachments + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return {"answer": "test image"} + + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = langchain_client.evaluate( + target, + data=dataset_name, + evaluators=[evaluator], + num_repetitions=2, + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_evaluate_with_attachments_not_in_target(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + 
outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + return {"answer": "test image"} + + def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = langchain_client.evaluate( + target, + data=dataset_name, + evaluators=[evaluator], + num_repetitions=2, + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +def test_evaluate_with_no_attachments(langchain_client: Client) -> None: + """Test evaluating examples without attachments using a target with attachments.""" + dataset_name = "__test_evaluate_no_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals without attachments", + data_type=DataType.kv, + ) + + # Create example using old way, attachments should be set to {} + langchain_client.create_example( + dataset_id=dataset.id, + inputs={"question": "What is 2+2?"}, + outputs={"answer": "4"}, + ) + + # Verify we can create example the new way without attachments + example = ExampleUploadWithAttachments( + inputs={"question": "What is 3+1?"}, + outputs={"answer": "4"}, + ) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive an empty attachments dict + assert isinstance(attachments, dict) + assert len(attachments) == 0 + return {"answer": "4"} + + def evaluator(run: Run, example: Example) -> Dict[str, Any]: + return { + "score": float( + run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + ) + } + + results = evaluate( + target, data=dataset_name, evaluators=[evaluator], client=langchain_client + ) + + assert len(results) == 2 + for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments(langchain_client: Client) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive the attachment data + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return {"answer": "test 
image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_attachments_not_in_target( + langchain_client: Client, +) -> None: + """Test evaluating examples with attachments.""" + dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals with attachments", + data_type=DataType.kv, + ) + + example = ExampleUploadWithAttachments( + inputs={"question": "What is shown in the image?"}, + outputs={"answer": "test image"}, + attachments={ + "image": ("image/png", b"fake image data for testing"), + }, + ) + + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target(inputs: Dict[str, Any]) -> Dict[str, Any]: + # Verify we receive the attachment data + return {"answer": "test image"} + + async def evaluator( + outputs: dict, reference_outputs: dict, attachments: dict + ) -> Dict[str, Any]: + assert "image" in attachments + assert "presigned_url" in attachments["image"] + image_data = attachments["image"]["reader"] + assert image_data.read() == b"fake image data for testing" + return { + "score": float( + reference_outputs.get("answer") == outputs.get("answer") # type: ignore + ) + } + + results = await langchain_client.aevaluate( + target, data=dataset_name, evaluators=[evaluator], num_repetitions=2 + ) + + assert len(results) == 2 + async for result in results: + assert result["evaluation_results"]["results"][0].score == 1.0 + + langchain_client.delete_dataset(dataset_name=dataset_name) + + +async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None: + """Test evaluating examples without attachments using a target with attachments.""" + dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for evals without attachments", + data_type=DataType.kv, + ) + + # Create example using old way, attachments should be set to {} + langchain_client.create_example( + dataset_id=dataset.id, + inputs={"question": "What is 2+2?"}, + outputs={"answer": "4"}, + ) + + # Verify we can create example the new way without attachments + example = ExampleUploadWithAttachments( + inputs={"question": "What is 3+1?"}, + outputs={"answer": "4"}, + ) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) + + async def target( + inputs: Dict[str, Any], attachments: Dict[str, Any] + ) -> Dict[str, Any]: + # Verify we receive an empty attachments dict + assert isinstance(attachments, dict) + assert len(attachments) == 0 + return {"answer": "4"} + + async def evaluator(run: Run, example: Example) -> Dict[str, Any]: + return { + "score": float( + run.outputs.get("answer") == example.outputs.get("answer") # type: ignore + ) + } + + results = 
await aevaluate(
+        target, data=dataset_name, evaluators=[evaluator], client=langchain_client
+    )
+
+    assert len(results) == 2
+    async for result in results:
+        assert result["evaluation_results"]["results"][0].score == 1.0
+
+    langchain_client.delete_dataset(dataset_name=dataset_name)
+
+
 def test_examples_length_validation(langchain_client: Client) -> None:
     """Test that mismatched lengths raise ValueError for create and update examples."""
     dataset_name = "__test_examples_length_validation" + uuid4().hex[:4]
@@ -1059,3 +1597,299 @@ def test_examples_length_validation(langchain_client: Client) -> None:
 
     # Clean up
     langchain_client.delete_dataset(dataset_id=dataset.id)
+
+
+def test_update_example_with_attachments_operations(langchain_client: Client) -> None:
+    """Test updating an example with attachment operations."""
+    dataset_name = "__test_update_example_attachments" + uuid4().hex[:4]
+    dataset = langchain_client.create_dataset(
+        dataset_name=dataset_name,
+        description="Test dataset for updating example attachments",
+    )
+    example_id = uuid4()
+    # Create example with attachments
+    example = ExampleUploadWithAttachments(
+        id=example_id,
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image"},
+        attachments={
+            "image1": ("image/png", b"fake image data 1"),
+            "image2": ("image/png", b"fake image data 2"),
+        },
+    )
+    langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])
+
+    # Update example with attachment operations to rename and retain attachments
+    attachments_operations = AttachmentsOperations(
+        rename={"image1": "renamed_image"},
+        retain=["image2"],  # Keep image2; image1 is renamed to renamed_image
+    )
+
+    langchain_client.update_example(
+        example_id=example_id,
+        attachments_operations=attachments_operations,
+    )
+
+    # Verify the update
+    retrieved_example = langchain_client.read_example(
+        example_id=example_id,
+    )
+
+    # Check that the renamed and retained attachments are both present
+    assert len(retrieved_example.attachments) == 2
+    assert "renamed_image" in retrieved_example.attachments
+    assert "image2" in retrieved_example.attachments
+    assert "image1" not in retrieved_example.attachments
+    assert (
+        retrieved_example.attachments["image2"]["reader"].read() == b"fake image data 2"
+    )
+    assert (
+        retrieved_example.attachments["renamed_image"]["reader"].read()
+        == b"fake image data 1"
+    )
+
+    # Clean up
+    langchain_client.delete_dataset(dataset_id=dataset.id)
+
+
+def test_bulk_update_examples_with_attachments_operations(
+    langchain_client: Client,
+) -> None:
+    """Test bulk updating examples with attachment operations."""
+    dataset_name = "__test_bulk_update_attachments" + uuid4().hex[:4]
+    dataset = langchain_client.create_dataset(
+        dataset_name=dataset_name,
+        description="Test dataset for bulk updating example attachments",
+    )
+
+    example_id1, example_id2 = uuid4(), uuid4()
+    # Create two examples with attachments
+    example1 = ExampleUploadWithAttachments(
+        id=example_id1,
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image 1"},
+        attachments={
+            "image1": ("image/png", b"fake image data 1"),
+            "extra": ("text/plain", b"extra data"),
+        },
+    )
+    example2 = ExampleUploadWithAttachments(
+        id=example_id2,
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image 2"},
+        attachments={
+            "image2": ("image/png", b"fake image data 2"),
+            "extra": ("text/plain", b"extra data"),
+        },
+    )
+
+    created_examples = langchain_client.upload_examples_multipart(
+        dataset_id=dataset.id,
uploads=[example1, example2], + ) + assert len(created_examples["example_ids"]) == 2 + assert str(example_id1) in created_examples["example_ids"] + assert str(example_id2) in created_examples["example_ids"] + + # Update both examples with different attachment operations + attachments_operations = [ + AttachmentsOperations( + rename={"image1": "renamed_image1"}, + ), + AttachmentsOperations(retain=["extra"]), + ] + + langchain_client.update_examples( + example_ids=[example_id1, example_id2], + attachments_operations=attachments_operations, + ) + + # Verify the updates + updated_examples = list( + langchain_client.list_examples( + dataset_id=dataset.id, + example_ids=[example_id1, example_id2], + include_attachments=True, + ) + ) + + updated_example_1 = next(ex for ex in updated_examples if ex.id == example_id1) + updated_example_2 = next(ex for ex in updated_examples if ex.id == example_id2) + # Check first example + assert len(updated_example_1.attachments) == 1 + assert "renamed_image1" in updated_example_1.attachments + assert "extra" not in updated_example_1.attachments + + # Check second example + assert len(updated_example_2.attachments) == 1 + assert "extra" in updated_example_2.attachments + assert "image2" not in updated_example_2.attachments + + # Check attachment data + assert ( + updated_example_1.attachments["renamed_image1"]["reader"].read() + == b"fake image data 1" + ) + assert updated_example_2.attachments["extra"]["reader"].read() == b"extra data" + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) + + +def test_update_examples_multipart(langchain_client: Client) -> None: + """Test updating examples with attachments via multipart endpoint.""" + dataset_name = "__test_update_examples_multipart" + uuid4().hex[:4] + if langchain_client.has_dataset(dataset_name=dataset_name): + langchain_client.delete_dataset(dataset_name=dataset_name) + + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for multipart example updates", + data_type=DataType.kv, + ) + example_ids = [uuid4() for _ in range(2)] + + # First create some examples with attachments + example_1 = ExampleUploadWithAttachments( + id=example_ids[0], + inputs={"text": "hello world"}, + attachments={ + "file1": ("text/plain", b"original content 1"), + "file2": ("text/plain", b"original content 2"), + }, + ) + + example_2 = ExampleUploadWithAttachments( + id=example_ids[1], + inputs={"text": "second example"}, + attachments={ + "file3": ("text/plain", b"original content 3"), + "file4": ("text/plain", b"original content 4"), + }, + ) + + created_examples = langchain_client.upload_examples_multipart( + dataset_id=dataset.id, uploads=[example_1, example_2] + ) + assert created_examples["count"] == 2 + + # Now create update operations + update_1 = ExampleUpdateWithAttachments( + id=example_ids[0], + inputs={"text": "updated hello world"}, + attachments={ + "new_file1": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + retain=["file1"], + ), + ) + + update_2 = ExampleUpdateWithAttachments( + id=example_ids[1], + inputs={"text": "updated second example"}, + attachments={ + "new_file2": ("text/plain", b"new content 2"), + }, + attachments_operations=AttachmentsOperations(retain=["file3"]), + ) + + # Test updating multiple examples at once + updated_examples = langchain_client.update_examples_multipart( + dataset_id=dataset.id, updates=[update_1, update_2] + ) + assert updated_examples["count"] == 2 + + # Verify the updates + updated = 
list( + langchain_client.list_examples( + dataset_id=dataset.id, + include_attachments=True, + ) + ) + + # Verify first example updates + example_1_updated = next(ex for ex in updated if ex.id == example_ids[0]) + assert example_1_updated.inputs["text"] == "updated hello world" + assert "file1" in example_1_updated.attachments + assert "new_file1" in example_1_updated.attachments + assert "file2" not in example_1_updated.attachments + assert ( + example_1_updated.attachments["new_file1"]["reader"].read() == b"new content 1" + ) + assert ( + example_1_updated.attachments["file1"]["reader"].read() == b"original content 1" + ) + + # Verify second example updates + example_2_updated = next(ex for ex in updated if ex.id == example_ids[1]) + assert example_2_updated.inputs["text"] == "updated second example" + assert "file3" in example_2_updated.attachments + assert "new_file2" in example_2_updated.attachments + assert "file4" not in example_2_updated.attachments + assert "file3" in example_2_updated.attachments + assert "new_file2" in example_2_updated.attachments + assert "file4" not in example_2_updated.attachments + assert ( + example_2_updated.attachments["file3"]["reader"].read() == b"original content 3" + ) + assert ( + example_2_updated.attachments["new_file2"]["reader"].read() == b"new content 2" + ) + + # Test updating non-existent example doesn't do anything + response = langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpdateWithAttachments( + id=uuid4(), + inputs={"text": "should fail"}, + ) + ], + ) + assert response["count"] == 0 + + # Test new attachments have priority + response = langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpdateWithAttachments( + id=example_ids[0], + attachments={ + "renamed_file1": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + retain=["renamed_file1"], + ), + ) + ], + ) + assert response["count"] == 1 + example_1_updated = langchain_client.read_example(example_ids[0]) + assert list(example_1_updated.attachments.keys()) == ["renamed_file1"] + assert ( + example_1_updated.attachments["renamed_file1"]["reader"].read() + == b"new content 1" + ) + + # Test new attachments have priority + response = langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpdateWithAttachments( + id=example_ids[0], + attachments={ + "foo": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + rename={"renamed_file1": "foo"}, + ), + ) + ], + ) + assert response["count"] == 1 + example_1_updated = langchain_client.read_example(example_ids[0]) + assert list(example_1_updated.attachments.keys()) == ["foo"] + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 132af656a..e33d07fd5 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -5,19 +5,22 @@ import itertools import json import random +import re import sys import time import uuid from datetime import datetime, timezone from threading import Lock -from typing import Callable, List +from typing import Any, Callable, Dict, List, Tuple from unittest import mock from unittest.mock import MagicMock import pytest +from langchain_core.runnables import chain as as_runnable from langsmith import Client, aevaluate, evaluate from langsmith import 
schemas as ls_schemas +from langsmith.evaluation._runner import _include_attachments from langsmith.evaluation.evaluator import ( _normalize_comparison_evaluator_func, _normalize_evaluator_func, @@ -47,7 +50,9 @@ def request(self, verb: str, endpoint: str, *args, **kwargs): return res elif endpoint == "http://localhost:1984/examples": res = MagicMock() - res.json.return_value = [e.dict() for e in self.ds_examples] + res.json.return_value = [ + e.dict() if not isinstance(e, dict) else e for e in self.ds_examples + ] return res elif endpoint == "http://localhost:1984/sessions": res = {} # type: ignore @@ -137,14 +142,23 @@ def _wait_until(condition: Callable, timeout: int = 8): raise TimeoutError("Condition not met") -def _create_example(idx: int) -> ls_schemas.Example: +def _create_example(idx: int) -> Tuple[ls_schemas.Example, Dict[str, Any]]: + _id = uuid.uuid4() + _created_at = datetime.now(timezone.utc) return ls_schemas.Example( - id=uuid.uuid4(), + id=_id, inputs={"in": idx}, outputs={"answer": idx + 1}, dataset_id="00886375-eb2a-4038-9032-efff60309896", - created_at=datetime.now(timezone.utc), - ) + created_at=_created_at, + ), { + "id": _id, + "dataset_id": "00886375-eb2a-4038-9032-efff60309896", + "created_at": _created_at, + "inputs": {"in": idx}, + "outputs": {"answer": idx + 1}, + "attachment_urls": None, + } @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") @@ -160,10 +174,13 @@ def test_evaluate_results( SPLIT_SIZE = 3 NUM_REPETITIONS = 4 - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] dev_split = random.sample(ds_examples, SPLIT_SIZE) tenant_id = str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client( api_url="http://localhost:1984", @@ -225,6 +242,14 @@ def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -253,6 +278,8 @@ def summary_eval_outputs_reference(outputs, reference_outputs): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -387,7 +414,12 @@ def eval2(x, y, inputs): _normalize_evaluator_func(eval_) with pytest.raises(ValueError, match="Invalid evaluator function."): - evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client) + evaluate( + (lambda inputs: inputs), + data=ds_examples, + evaluators=[eval_], + client=client, + ) def test_evaluate_raises_for_async(): @@ -431,10 +463,13 @@ async def test_aevaluate_results( SPLIT_SIZE = 3 NUM_REPETITIONS = 4 - ds_examples = [_create_example(i) for i in range(10)] + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] dev_split = random.sample(ds_examples, SPLIT_SIZE) tenant_id = 
str(uuid.uuid4()) - fake_request = FakeRequest(ds_id, ds_name, ds_examples, tenant_id) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) session.request = fake_request.request client = Client( api_url="http://localhost:1984", @@ -499,6 +534,14 @@ async def score_unpacked_inputs_outputs_reference( ordering_of_stuff.append("evaluate") return {"score": reference_outputs["answer"]} + async def score_unpacked_inputs_outputs_attachments(inputs, outputs, attachments): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + async def score_unpacked_outputs(outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -527,6 +570,8 @@ def summary_eval_outputs_reference(outputs, reference_outputs): score_value_first, score_unpacked_inputs_outputs, score_unpacked_inputs_outputs_reference, + score_unpacked_inputs_outputs_attachments, + score_unpacked_outputs, eval_float, eval_str, eval_list, @@ -658,8 +703,8 @@ async def eval2(x, y, inputs): evaluators = [eval1, eval2] - async def atarget(x): - return x + async def atarget(inputs): + return inputs for eval_ in evaluators: with pytest.raises(ValueError, match="Invalid evaluator function."): @@ -676,6 +721,242 @@ async def atarget(x): ) +@as_runnable +def nested_predict(inputs): + return {"output": "Yes"} + + +@as_runnable +def lc_predict(inputs): + return nested_predict.invoke(inputs) + + +async def async_just_inputs(inputs): + return None + + +async def async_just_inputs_with_attachments(inputs, attachments): + return None + + +async def async_extra_args(inputs, attachments, foo="bar"): + return None + + +@pytest.mark.parametrize( + "target,expected,error_msg,is_async", + [ + # Valid cases + (lambda inputs: None, False, None, False), + (lambda inputs, attachments: None, True, None, False), + (async_just_inputs, False, None, True), + (async_just_inputs_with_attachments, True, None, True), + # Invalid parameter names + ( + lambda x, y: None, + None, + re.escape( + "When passing 2 positional arguments, they must be named 'inputs' and " + "'attachments', respectively. Received: ['x', 'y']" + ), + False, + ), + ( + lambda input, attachment: None, + None, + re.escape( + "When passing 2 positional arguments, they must be named 'inputs' and " + "'attachments', respectively. Received: ['input', 'attachment']" + ), + False, + ), + # Too many parameters + ( + lambda inputs, attachments, extra: None, + None, + re.escape( + "Target function must accept at most two arguments without " + "default values: (inputs, attachments)." 
+ ), + False, + ), + # No positional parameters + ( + lambda *, foo="bar": None, + None, + re.escape( + "Target function must accept at least one positional argument (inputs)" + ), + False, + ), + # Mixed positional and keyword + (lambda inputs, *, optional=None: None, False, None, False), + (lambda inputs, attachments, *, optional=None: None, True, None, False), + # Non-callable + ("not_a_function", False, None, False), + # Runnable + (lc_predict.invoke, False, None, False), + # Positional args with defaults + (lambda inputs, attachments, foo="bar": None, True, None, False), + (async_extra_args, True, None, True), + ], +) +def test_include_attachments(target, expected, error_msg, is_async): + """Test the _include_attachments function with various input cases.""" + try: + from langchain_core.runnables import RunnableLambda + except ImportError: + if target == "runnable": + pytest.skip("langchain-core not installed") + return + + if target == "runnable": + target = RunnableLambda(lambda x: x) + expected = False + error_msg = None + + if error_msg is not None: + with pytest.raises(ValueError, match=error_msg): + _include_attachments(target) + else: + result = _include_attachments(target) + assert result == expected + + +def valid_single_supported(inputs, *, optional=None): + return {"score": 1} + + +async def valid_single_supported_async(inputs, *, optional=None): + return {"score": 1} + + +def valid_two_arbitrary(foo, bar, *, optional=None): + return {"score": 1} + + +async def valid_two_arbitrary_async(foo, bar, *, optional=None): + return {"score": 1} + + +def valid_multiple_supported(inputs, outputs, reference_outputs, *, optional=None): + return {"score": 1} + + +async def valid_multiple_supported_async( + inputs, outputs, reference_outputs, *, optional=None +): + return {"score": 1} + + +def invalid_single_unsupported(foo, *, optional=None): + return {"score": 1} + + +async def invalid_single_unsupported_async(foo, *, optional=None): + return {"score": 1} + + +def invalid_three_args(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +async def invalid_three_args_async(inputs, outputs, foo, *, optional=None): + return {"score": 1} + + +def invalid_no_positional(*, inputs, outputs, optional=None): + return {"score": 1} + + +async def invalid_no_positional_async(*, inputs, outputs, optional=None): + return {"score": 1} + + +# Test cases that should succeed +VALID_EVALUATOR_CASES = [ + (valid_single_supported, False), + (valid_single_supported_async, True), + (valid_two_arbitrary, False), + (valid_two_arbitrary_async, True), + (valid_multiple_supported, False), + (valid_multiple_supported_async, True), +] + +# Test cases that should raise ValueError +INVALID_EVALUATOR_CASES = [ + (invalid_single_unsupported, False), + (invalid_single_unsupported_async, True), + (invalid_three_args, False), + (invalid_three_args_async, True), + (invalid_no_positional, False), + (invalid_no_positional_async, True), +] + + +def target(inputs, attachments): + return {"foo": "bar"} + + +async def atarget(inputs, attachments): + return {"foo": "bar"} + + +@pytest.mark.parametrize("func,is_async", VALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_valid(func, is_async): + """Test _normalize_evaluator_func succeeds.""" + func = _normalize_evaluator_func(func) + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] + tenant_id = 
str(uuid.uuid4()) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + +@pytest.mark.parametrize("func,is_async", INVALID_EVALUATOR_CASES) +def test_normalize_evaluator_func_invalid(func, is_async): + """Test _normalize_evaluator_func fails correctly.""" + with pytest.raises(ValueError, match="Invalid evaluator function"): + _normalize_evaluator_func(func) + + session = mock.Mock() + ds_name = "my-dataset" + ds_id = "00886375-eb2a-4038-9032-efff60309896" + + ds_example_responses = [_create_example(i) for i in range(10)] + ds_examples = [e[0] for e in ds_example_responses] + tenant_id = str(uuid.uuid4()) + fake_request = FakeRequest( + ds_id, ds_name, [e[1] for e in ds_example_responses], tenant_id + ) + session.request = fake_request.request + client = Client(api_url="http://localhost:1984", api_key="123", session=session) + client._tenant_id = tenant_id # type: ignore + + with pytest.raises(ValueError, match="Invalid evaluator function"): + if is_async: + asyncio.run( + aevaluate(atarget, data=ds_examples, evaluators=[func], client=client) + ) + else: + evaluate(target, data=ds_examples, evaluators=[func], client=client) + + def summary_eval_runs_examples(runs_, examples_): return {"score": len(runs_[0].dotted_order)} diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py index 902f6f392..939aa9ad2 100644 --- a/python/tests/unit_tests/test_client.py +++ b/python/tests/unit_tests/test_client.py @@ -417,6 +417,97 @@ def test_create_run_mutate( assert outputs == {"messages": ["hi", "there"]} +@mock.patch("langsmith.client.requests.Session") +def test_upsert_examples_multipart(mock_session_cls: mock.Mock) -> None: + """Test that upsert_examples_multipart sends correct multipart data.""" + mock_session = MagicMock() + mock_response = MagicMock() + mock_response.status_code = 200 + mock_session.request.return_value = mock_response + mock_session_cls.return_value = mock_session + + client = Client( + api_url="http://localhost:1984", + api_key="123", + info={"instance_flags": {"examples_multipart_enabled": True}}, + ) + + # Create test data + example_id = uuid.uuid4() + dataset_id = uuid.uuid4() + created_at = datetime(2015, 1, 1, 0, 0, 0) + + example = ls_schemas.ExampleUpsertWithAttachments( + id=example_id, + dataset_id=dataset_id, + created_at=created_at, + inputs={"input": "test input"}, + outputs={"output": "test output"}, + metadata={"meta": "data"}, + split="train", + attachments={ + "file1": ("text/plain", b"test data"), + "file2": ls_schemas.Attachment( + mime_type="application/json", data=b'{"key": "value"}' + ), + }, + ) + client.upsert_examples_multipart(upserts=[example]) + + # Verify the request + assert mock_session.request.call_count == 1 + call_args = mock_session.request.call_args + + assert call_args[0][0] == "POST" + assert call_args[0][1].endswith("/v1/platform/examples/multipart") + + # Parse the multipart data + request_data = call_args[1]["data"] + content_type = call_args[1]["headers"]["Content-Type"] + boundary = parse_options_header(content_type)[1]["boundary"] + + parser = MultipartParser( + io.BytesIO( + request_data + if 
isinstance(request_data, bytes) + else request_data.to_string() + ), + boundary, + ) + parts = list(parser.parts()) + + # Verify all expected parts are present + expected_parts = { + str(example_id): { + "dataset_id": str(dataset_id), + "created_at": created_at.isoformat(), + "metadata": {"meta": "data"}, + "split": "train", + }, + f"{example_id}.inputs": {"input": "test input"}, + f"{example_id}.outputs": {"output": "test output"}, + f"{example_id}.attachment.file1": "test data", + f"{example_id}.attachment.file2": '{"key": "value"}', + } + + assert len(parts) == len(expected_parts) + + for part in parts: + name = part.name + assert name in expected_parts, f"Unexpected part: {name}" + + if name.endswith(".attachment.file1"): + assert part.value == expected_parts[name] + assert part.headers["Content-Type"] == "text/plain; length=9" + elif name.endswith(".attachment.file2"): + assert part.value == expected_parts[name] + assert part.headers["Content-Type"] == "application/json; length=16" + else: + value = json.loads(part.value) + assert value == expected_parts[name] + assert part.headers["Content-Type"] == "application/json" + + class CallTracker: def __init__(self) -> None: self.counter = 0
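
Usage sketch (illustrative, not part of the diff): the flow these tests exercise, distilled into a minimal example. It assumes LangSmith credentials are configured in the environment; the dataset name and attachment bytes are placeholders.

from uuid import uuid4

from langsmith import Client, evaluate
from langsmith.schemas import ExampleUploadWithAttachments

client = Client()
dataset_name = "attachments-demo-" + uuid4().hex[:4]  # placeholder name
dataset = client.create_dataset(dataset_name)

# Attachments are uploaded through the multipart endpoint as (mime_type, bytes) pairs.
client.upload_examples_multipart(
    dataset_id=dataset.id,
    uploads=[
        ExampleUploadWithAttachments(
            inputs={"question": "What is shown in the image?"},
            outputs={"answer": "test image"},
            attachments={"image": ("image/png", b"fake image data for testing")},
        )
    ],
)

# A target opts in to attachments by accepting (inputs, attachments); each
# attachment exposes a presigned_url and a reader over the raw bytes.
def target(inputs: dict, attachments: dict) -> dict:
    _ = attachments["image"]["reader"].read()
    return {"answer": "test image"}

# Evaluators opt in the same way, by naming an `attachments` parameter.
def evaluator(outputs: dict, reference_outputs: dict, attachments: dict) -> dict:
    assert "presigned_url" in attachments["image"]
    return {"score": float(outputs["answer"] == reference_outputs["answer"])}

results = evaluate(target, data=dataset_name, evaluators=[evaluator], client=client)

Targets and evaluators that omit the `attachments` argument (as in test_evaluate_with_no_attachments above) continue to work unchanged.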