From 8bb0826dfe589b6c418b1b0c1733da93b549b4ee Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:54:53 -0800 Subject: [PATCH] update examples multipart (#1310) --- python/langsmith/client.py | 89 ++++- python/langsmith/schemas.py | 17 + python/tests/integration_tests/test_client.py | 306 ++++++++++++++++++ 3 files changed, 407 insertions(+), 5 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 0f39aa9c0..c173cc7cb 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -3464,6 +3464,7 @@ def _prepate_multipart_data( examples: Union[ List[ls_schemas.ExampleUploadWithAttachments] | List[ls_schemas.ExampleUpsertWithAttachments] + | List[ls_schemas.ExampleUpdateWithAttachments], ], include_dataset_id: bool = False, ) -> Tuple[Any, bytes]: @@ -3477,21 +3478,29 @@ def _prepate_multipart_data( dataset_id = examples[0].dataset_id for example in examples: - if not isinstance( - example, ls_schemas.ExampleUploadWithAttachments - ) and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments): + if ( + not isinstance(example, ls_schemas.ExampleUploadWithAttachments) + and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments) + and not isinstance(example, ls_schemas.ExampleUpdateWithAttachments) + ): raise ValueError( "The examples must be of type ExampleUploadWithAttachments" " or ExampleUpsertWithAttachments" + " or ExampleUpdateWithAttachments" ) if example.id is not None: example_id = str(example.id) else: example_id = str(uuid.uuid4()) + if isinstance(example, ls_schemas.ExampleUpdateWithAttachments): + created_at = None + else: + created_at = example.created_at + example_body = { **({"dataset_id": dataset_id} if include_dataset_id else {}), - "created_at": example.created_at, + **({"created_at": created_at} if created_at is not None else {}), } if example.metadata is not None: example_body["metadata"] = example.metadata @@ -3582,6 
+3591,23 @@ def _prepate_multipart_data( ) ) + if ( + isinstance(example, ls_schemas.ExampleUpdateWithAttachments) + and example.attachments_operations + ): + attachments_operationsb = _dumps_json(example.attachments_operations) + parts.append( + ( + f"{example_id}.attachments_operations", + ( + None, + attachments_operationsb, + "application/json", + {}, + ), + ) + ) + encoder = rqtb_multipart.MultipartEncoder(parts, boundary=BOUNDARY) if encoder.len <= 20_000_000: # ~20 MB data = encoder.to_string() @@ -3590,6 +3616,38 @@ def _prepate_multipart_data( return encoder, data + def update_examples_multipart( + self, + *, + dataset_id: ID_TYPE, + updates: Optional[List[ls_schemas.ExampleUpdateWithAttachments]] = None, + ) -> ls_schemas.UpsertExamplesResponse: + """Update examples.""" + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the multipart examples endpoint, please update to the latest version." + ) + if updates is None: + updates = [] + + encoder, data = self._prepate_multipart_data(updates, include_dataset_id=False) + + response = self.request_with_retries( + "PATCH", + f"/v1/platform/datasets/{dataset_id}/examples", + request_kwargs={ + "data": data, + "headers": { + **self._headers, + "Content-Type": encoder.content_type, + }, + }, + ) + ls_utils.raise_for_status_with_text(response) + return response.json() + def upload_examples_multipart( self, *, @@ -4072,6 +4130,7 @@ def update_example( metadata: Optional[Dict] = None, split: Optional[str | List[str]] = None, dataset_id: Optional[ID_TYPE] = None, + attachments_operations: Optional[ls_schemas.AttachmentsOperations] = None, ) -> Dict[str, Any]: """Update a specific example. @@ -4096,12 +4155,20 @@ def update_example( Dict[str, Any] The updated example.
""" + if attachments_operations is not None: + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the attachment operations, please update to the latest version." + ) example = dict( inputs=inputs, outputs=outputs, dataset_id=dataset_id, metadata=metadata, split=split, + attachments_operations=attachments_operations, ) response = self.request_with_retries( "PATCH", @@ -4121,6 +4188,9 @@ def update_examples( metadata: Optional[Sequence[Optional[Dict]]] = None, splits: Optional[Sequence[Optional[str | List[str]]]] = None, dataset_ids: Optional[Sequence[Optional[ID_TYPE]]] = None, + attachments_operations: Optional[ + Sequence[Optional[ls_schemas.AttachmentsOperations]] + ] = None, ) -> Dict[str, Any]: """Update multiple examples. @@ -4145,12 +4215,20 @@ def update_examples( Dict[str, Any] The response from the server (specifies the number of examples updated). """ + if attachments_operations is not None: + if not (self.info.instance_flags or {}).get( + "dataset_examples_multipart_enabled", False + ): + raise ValueError( + "Your LangSmith version does not allow using the attachment operations, please update to the latest version." 
+ ) sequence_args = { "inputs": inputs, "outputs": outputs, "metadata": metadata, "splits": splits, "dataset_ids": dataset_ids, + "attachments_operations": attachments_operations, } # Since inputs are required, we will check against them examples_len = len(example_ids) @@ -4168,8 +4246,9 @@ def update_examples( "dataset_id": dataset_id_, "metadata": metadata_, "split": split_, + "attachments_operations": attachments_operations_, } - for id_, in_, out_, metadata_, split_, dataset_id_ in zip( + for id_, in_, out_, metadata_, split_, dataset_id_, attachments_operations_ in zip( example_ids, inputs or [None] * len(example_ids), outputs or [None] * len(example_ids), diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 30a65a018..acedaf177 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -183,12 +183,24 @@ class ExampleSearch(ExampleBase): id: UUID +class AttachmentsOperations(BaseModel): + """Operations to perform on attachments.""" + + rename: Dict[str, str] = Field( + default_factory=dict, description="Mapping of old attachment names to new names" + ) + retain: List[str] = Field( + default_factory=list, description="List of attachment names to keep" + ) + + class ExampleUpdate(BaseModel): """Update class for Example.""" dataset_id: Optional[UUID] = None inputs: Optional[Dict[str, Any]] = None outputs: Optional[Dict[str, Any]] = None + attachments_operations: Optional[AttachmentsOperations] = None metadata: Optional[Dict[str, Any]] = None split: Optional[Union[str, List[str]]] = None @@ -202,7 +214,12 @@ class ExampleUpdateWithAttachments(ExampleUpdate): """Example update with attachments.""" id: UUID + inputs: Dict[str, Any] = Field(default_factory=dict) + outputs: Optional[Dict[str, Any]] = Field(default=None) + metadata: Optional[Dict[str, Any]] = Field(default=None) + split: Optional[Union[str, List[str]]] = None attachments: Optional[Attachments] = None + attachments_operations: Optional[AttachmentsOperations] = 
None class DataType(str, Enum): diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index bef79a594..33eec0f46 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -22,8 +22,10 @@ from langsmith.client import ID_TYPE, Client from langsmith.evaluation import evaluate from langsmith.schemas import ( + AttachmentsOperations, DataType, Example, + ExampleUpdateWithAttachments, ExampleUploadWithAttachments, ExampleUpsertWithAttachments, Run, @@ -1392,3 +1394,307 @@ def test_examples_length_validation(langchain_client: Client) -> None: # Clean up langchain_client.delete_dataset(dataset_id=dataset.id) + + +def test_update_example_with_attachments_operations(langchain_client: Client) -> None: + """Test updating an example with attachment operations.""" + dataset_name = "__test_update_example_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name=dataset_name, + description="Test dataset for updating example attachments", + ) + + # Create example with attachments + example = ExampleUploadWithAttachments( + inputs={"query": "What's in this image?"}, + outputs={"answer": "A test image"}, + attachments={ + "image1": ("image/png", b"fake image data 1"), + "image2": ("image/png", b"fake image data 2"), + }, + ) + created_example = langchain_client.upload_examples_multipart( + dataset_id=dataset.id, uploads=[example] + ) + + # Update example with attachment operations to rename and retain attachments + attachments_operations = AttachmentsOperations( + rename={"image1": "renamed_image"}, + retain=["image2"], # Keep image2 as-is alongside the renamed image1 + ) + + langchain_client.update_example( + example_id=created_example.id, + attachments_operations=attachments_operations, + ) + + # Verify the update + retrieved_example = langchain_client.read_example( + example_id=created_example.id, + ) + + # Check that each renamed or retained attachment 
exists + assert len(retrieved_example.attachments_info) == 2 + assert "renamed_image" in retrieved_example.attachments_info + assert "image2" in retrieved_example.attachments_info + assert "image1" not in retrieved_example.attachments_info + assert ( + retrieved_example.attachments_info["image2"]["reader"].read() + == b"fake image data 2" + ) + assert ( + retrieved_example.attachments_info["renamed_image"]["reader"].read() + == b"fake image data 1" + ) + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) + + +def test_bulk_update_examples_with_attachments_operations( + langchain_client: Client, +) -> None: + """Test bulk updating examples with attachment operations.""" + dataset_name = "__test_bulk_update_attachments" + uuid4().hex[:4] + dataset = langchain_client.create_dataset( + dataset_name=dataset_name, + description="Test dataset for bulk updating example attachments", + ) + + # Create two examples with attachments + example1 = ExampleUploadWithAttachments( + inputs={"query": "What's in this image?"}, + outputs={"answer": "A test image 1"}, + attachments={ + "image1": ("image/png", b"fake image data 1"), + "extra": ("text/plain", b"extra data"), + }, + ) + example2 = ExampleUploadWithAttachments( + inputs={"query": "What's in this image?"}, + outputs={"answer": "A test image 2"}, + attachments={ + "image2": ("image/png", b"fake image data 2"), + "extra": ("text/plain", b"extra data"), + }, + ) + + created_examples = langchain_client.upload_examples_multipart( + dataset_id=dataset.id, + uploads=[example1, example2], + ) + example_ids = [ex.id for ex in created_examples] + + # Update both examples with different attachment operations + attachments_operations = [ + AttachmentsOperations( + rename={"image1": "renamed_image1"}, + ), + AttachmentsOperations(retain=["extra"]), + ] + + langchain_client.update_examples( + example_ids=example_ids, + attachments_operations=attachments_operations, + ) + + # Verify the updates + updated_examples = list( 
+ langchain_client.list_examples( + dataset_id=dataset.id, + example_ids=example_ids, + include_attachments=True, + ) + ) + + # Check first example + assert len(updated_examples[0].attachments) == 1 + assert "renamed_image1" in updated_examples[0].attachments + assert "extra" not in updated_examples[0].attachments + + # Check second example + assert len(updated_examples[1].attachments) == 1 + assert "extra" in updated_examples[1].attachments + assert "image2" not in updated_examples[1].attachments + + # Check attachment data + assert ( + updated_examples[0].attachments["renamed_image1"][1].read() + == b"fake image data 1" + ) + assert updated_examples[1].attachments["extra"][1].read() == b"extra data" + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) + + +def test_update_examples_multipart(langchain_client: Client) -> None: + """Test updating examples with attachments via multipart endpoint.""" + dataset_name = "__test_update_examples_multipart" + uuid4().hex[:4] + if langchain_client.has_dataset(dataset_name=dataset_name): + langchain_client.delete_dataset(dataset_name=dataset_name) + + dataset = langchain_client.create_dataset( + dataset_name, + description="Test dataset for multipart example updates", + data_type=DataType.kv, + ) + + # First create some examples with attachments + example_1 = ExampleUploadWithAttachments( + inputs={"text": "hello world"}, + attachments={ + "file1": ("text/plain", b"original content 1"), + "file2": ("text/plain", b"original content 2"), + }, + ) + + example_2 = ExampleUploadWithAttachments( + inputs={"text": "second example"}, + attachments={ + "file3": ("text/plain", b"original content 3"), + "file4": ("text/plain", b"original content 4"), + }, + ) + + created_examples = langchain_client.upload_examples_multipart( + dataset_id=dataset.id, uploads=[example_1, example_2] + ) + assert created_examples["count"] == 2 + + examples = list(langchain_client.list_examples(dataset_id=dataset.id)) + example_ids = 
[ex.id for ex in examples] + + # Now create update operations + update_1 = ExampleUpdateWithAttachments( + id=example_ids[0], + inputs={"text": "updated hello world"}, + attachments={ + "new_file1": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + rename={"file1": "renamed_file1"}, + ), + ) + + update_2 = ExampleUpdateWithAttachments( + id=example_ids[1], + inputs={"text": "updated second example"}, + attachments={ + "new_file2": ("text/plain", b"new content 2"), + }, + attachments_operations=AttachmentsOperations(retain=["file3"]), + ) + + # Test updating multiple examples at once + updated_examples = langchain_client.update_examples_multipart( + dataset_id=dataset.id, updates=[update_1, update_2] + ) + assert updated_examples["count"] == 2 + + # Verify the updates + updated = list( + langchain_client.list_examples( + dataset_id=dataset.id, + include_attachments=True, + ) + ) + + # Verify first example updates + example_1_updated = next(ex for ex in updated if ex.id == example_ids[0]) + assert example_1_updated.inputs["text"] == "updated hello world" + assert "renamed_file1" in example_1_updated.attachments_info + assert "new_file1" in example_1_updated.attachments_info + assert "file2" not in example_1_updated.attachments_info + assert ( + example_1_updated.attachments_info["renamed_file1"]["reader"].read() + == b"original content 1" + ) + assert ( + example_1_updated.attachments_info["new_file1"]["reader"].read() + == b"new content 1" + ) + + # Verify second example updates + example_2_updated = next(ex for ex in updated if ex.id == example_ids[1]) + assert example_2_updated.inputs["text"] == "updated second example" + assert "file3" in example_2_updated.attachments_info + assert "new_file2" in example_2_updated.attachments_info + assert "file4" not in example_2_updated.attachments_info + assert ( + example_2_updated.attachments_info["file3"]["reader"].read() + == b"original content 3" + ) + assert ( + 
example_2_updated.attachments_info["new_file2"]["reader"].read() + == b"new content 2" + ) + + # Test updating examples in different datasets fails + other_dataset = langchain_client.create_dataset( + dataset_name=dataset_name + "_other", + description="Other test dataset", + ) + with pytest.raises(ValueError, match="All examples must be in the same dataset"): + langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpsertWithAttachments( + id=example_ids[0], + inputs={"text": "update 1"}, + ), + ExampleUpsertWithAttachments( + id=uuid4(), + inputs={"text": "update 2"}, + ), + ], + ) + + # Test updating non-existent example fails + with pytest.raises(LangSmithNotFoundError): + langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpsertWithAttachments( + id=uuid4(), + inputs={"text": "should fail"}, + ) + ], + ) + + # Test updating with mismatch named attachments fails + with pytest.raises(ValueError): + langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpdateWithAttachments( + id=example_ids[0], + attachments={ + "renamed_file1": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + retain=["renamed_file1"], + ), + ) + ], + ) + + with pytest.raises(ValueError): + langchain_client.update_examples_multipart( + dataset_id=dataset.id, + updates=[ + ExampleUpdateWithAttachments( + id=example_ids[0], + attachments={ + "foo": ("text/plain", b"new content 1"), + }, + attachments_operations=AttachmentsOperations( + rename={"renamed_file1": "foo"}, + ), + ) + ], + ) + + # Clean up + langchain_client.delete_dataset(dataset_id=dataset.id) + langchain_client.delete_dataset(dataset_id=other_dataset.id)