From 8bb0826dfe589b6c418b1b0c1733da93b549b4ee Mon Sep 17 00:00:00 2001
From: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Date: Tue, 10 Dec 2024 11:54:53 -0800
Subject: [PATCH] update examples multipart (#1310)

---
 python/langsmith/client.py                    |  89 ++++-
 python/langsmith/schemas.py                   |  17 +
 python/tests/integration_tests/test_client.py | 306 ++++++++++++++++++
 3 files changed, 407 insertions(+), 5 deletions(-)

diff --git a/python/langsmith/client.py b/python/langsmith/client.py
index 0f39aa9c0..c173cc7cb 100644
--- a/python/langsmith/client.py
+++ b/python/langsmith/client.py
@@ -3464,6 +3464,7 @@ def _prepate_multipart_data(
         examples: Union[
             List[ls_schemas.ExampleUploadWithAttachments]
             | List[ls_schemas.ExampleUpsertWithAttachments]
+            | List[ls_schemas.ExampleUpdateWithAttachments],
         ],
         include_dataset_id: bool = False,
     ) -> Tuple[Any, bytes]:
@@ -3477,21 +3478,29 @@ def _prepate_multipart_data(
             dataset_id = examples[0].dataset_id
 
         for example in examples:
-            if not isinstance(
-                example, ls_schemas.ExampleUploadWithAttachments
-            ) and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments):
+            if (
+                not isinstance(example, ls_schemas.ExampleUploadWithAttachments)
+                and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments)
+                and not isinstance(example, ls_schemas.ExampleUpdateWithAttachments)
+            ):
                 raise ValueError(
                     "The examples must be of type ExampleUploadWithAttachments"
                     " or ExampleUpsertWithAttachments"
+                    " or ExampleUpdateWithAttachments"
                 )
             if example.id is not None:
                 example_id = str(example.id)
             else:
                 example_id = str(uuid.uuid4())
 
+            if isinstance(example, ls_schemas.ExampleUpdateWithAttachments):
+                created_at = None
+            else:
+                created_at = example.created_at
+
             example_body = {
                 **({"dataset_id": dataset_id} if include_dataset_id else {}),
-                "created_at": example.created_at,
+                **({"created_at": created_at} if created_at is not None else {}),
             }
             if example.metadata is not None:
                 example_body["metadata"] = example.metadata
@@ -3582,6 +3591,23 @@ def _prepate_multipart_data(
                             )
                         )
 
+            if (
+                isinstance(example, ls_schemas.ExampleUpdateWithAttachments)
+                and example.attachments_operations
+            ):
+                attachments_operationsb = _dumps_json(example.attachments_operations)
+                parts.append(
+                    (
+                        f"{example_id}.attachments_operations",
+                        (
+                            None,
+                            attachments_operationsb,
+                            "application/json",
+                            {},
+                        ),
+                    )
+                )
+
         encoder = rqtb_multipart.MultipartEncoder(parts, boundary=BOUNDARY)
         if encoder.len <= 20_000_000:  # ~20 MB
             data = encoder.to_string()
@@ -3590,6 +3616,38 @@ def _prepate_multipart_data(
 
         return encoder, data
 
+    def update_examples_multipart(
+        self,
+        *,
+        dataset_id: ID_TYPE,
+        updates: Optional[List[ls_schemas.ExampleUpdateWithAttachments]] = None,
+    ) -> ls_schemas.UpsertExamplesResponse:
+        """Update examples via the multipart endpoint."""
+        if not (self.info.instance_flags or {}).get(
+            "dataset_examples_multipart_enabled", False
+        ):
+            raise ValueError(
+                "Your LangSmith version does not allow using the multipart examples endpoint, please update to the latest version."
+            )
+        if updates is None:
+            updates = []
+
+        encoder, data = self._prepate_multipart_data(updates, include_dataset_id=False)
+
+        response = self.request_with_retries(
+            "PATCH",
+            f"/v1/platform/datasets/{dataset_id}/examples",
+            request_kwargs={
+                "data": data,
+                "headers": {
+                    **self._headers,
+                    "Content-Type": encoder.content_type,
+                },
+            },
+        )
+        ls_utils.raise_for_status_with_text(response)
+        return response.json()
+
     def upload_examples_multipart(
         self,
         *,
@@ -4072,6 +4130,7 @@ def update_example(
         metadata: Optional[Dict] = None,
         split: Optional[str | List[str]] = None,
         dataset_id: Optional[ID_TYPE] = None,
+        attachments_operations: Optional[ls_schemas.AttachmentsOperations] = None,
     ) -> Dict[str, Any]:
         """Update a specific example.
 
@@ -4096,12 +4155,20 @@ def update_example(
         Dict[str, Any]
             The updated example.
         """
+        if attachments_operations is not None:
+            if not (self.info.instance_flags or {}).get(
+                "dataset_examples_multipart_enabled", False
+            ):
+                raise ValueError(
+                    "Your LangSmith version does not allow using the attachment operations, please update to the latest version."
+                )
         example = dict(
             inputs=inputs,
             outputs=outputs,
             dataset_id=dataset_id,
             metadata=metadata,
             split=split,
+            attachments_operations=attachments_operations,
         )
         response = self.request_with_retries(
             "PATCH",
@@ -4121,6 +4188,9 @@ def update_examples(
         metadata: Optional[Sequence[Optional[Dict]]] = None,
         splits: Optional[Sequence[Optional[str | List[str]]]] = None,
         dataset_ids: Optional[Sequence[Optional[ID_TYPE]]] = None,
+        attachments_operations: Optional[
+            Sequence[Optional[ls_schemas.AttachmentsOperations]]
+        ] = None,
     ) -> Dict[str, Any]:
         """Update multiple examples.
 
@@ -4145,12 +4215,20 @@ def update_examples(
         Dict[str, Any]
             The response from the server (specifies the number of examples updated).
         """
+        if attachments_operations is not None:
+            if not (self.info.instance_flags or {}).get(
+                "dataset_examples_multipart_enabled", False
+            ):
+                raise ValueError(
+                    "Your LangSmith version does not allow using the attachment operations, please update to the latest version."
+                )
         sequence_args = {
             "inputs": inputs,
             "outputs": outputs,
             "metadata": metadata,
             "splits": splits,
             "dataset_ids": dataset_ids,
+            "attachments_operations": attachments_operations,
         }
         # Since inputs are required, we will check against them
         examples_len = len(example_ids)
@@ -4168,8 +4246,9 @@ def update_examples(
                 "dataset_id": dataset_id_,
                 "metadata": metadata_,
                 "split": split_,
+                "attachments_operations": attachments_operations_,
             }
-            for id_, in_, out_, metadata_, split_, dataset_id_ in zip(
+            for id_, in_, out_, metadata_, split_, dataset_id_, attachments_operations_ in zip(
                 example_ids,
                 inputs or [None] * len(example_ids),
                 outputs or [None] * len(example_ids),
diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
index 30a65a018..acedaf177 100644
--- a/python/langsmith/schemas.py
+++ b/python/langsmith/schemas.py
@@ -183,12 +183,24 @@ class ExampleSearch(ExampleBase):
     id: UUID
 
 
+class AttachmentsOperations(BaseModel):
+    """Operations to perform on attachments."""
+
+    rename: Dict[str, str] = Field(
+        default_factory=dict, description="Mapping of old attachment names to new names"
+    )
+    retain: List[str] = Field(
+        default_factory=list, description="List of attachment names to keep"
+    )
+
+
 class ExampleUpdate(BaseModel):
     """Update class for Example."""
 
     dataset_id: Optional[UUID] = None
     inputs: Optional[Dict[str, Any]] = None
     outputs: Optional[Dict[str, Any]] = None
+    attachments_operations: Optional[AttachmentsOperations] = None
     metadata: Optional[Dict[str, Any]] = None
     split: Optional[Union[str, List[str]]] = None
 
@@ -202,7 +214,12 @@ class ExampleUpdateWithAttachments(ExampleUpdate):
     """Example update with attachments."""
 
     id: UUID
+    inputs: Dict[str, Any] = Field(default_factory=dict)
+    outputs: Optional[Dict[str, Any]] = Field(default=None)
+    metadata: Optional[Dict[str, Any]] = Field(default=None)
+    split: Optional[Union[str, List[str]]] = None
     attachments: Optional[Attachments] = None
+    attachments_operations: Optional[AttachmentsOperations] = None
 
 
 class DataType(str, Enum):
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index bef79a594..33eec0f46 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -22,8 +22,10 @@
 from langsmith.client import ID_TYPE, Client
 from langsmith.evaluation import evaluate
 from langsmith.schemas import (
+    AttachmentsOperations,
     DataType,
     Example,
+    ExampleUpdateWithAttachments,
     ExampleUploadWithAttachments,
     ExampleUpsertWithAttachments,
     Run,
@@ -1392,3 +1394,307 @@ def test_examples_length_validation(langchain_client: Client) -> None:
 
     # Clean up
     langchain_client.delete_dataset(dataset_id=dataset.id)
+
+
+def test_update_example_with_attachments_operations(langchain_client: Client) -> None:
+    """Test updating an example with attachment operations."""
+    dataset_name = "__test_update_example_attachments" + uuid4().hex[:4]
+    dataset = langchain_client.create_dataset(
+        dataset_name=dataset_name,
+        description="Test dataset for updating example attachments",
+    )
+
+    # Create example with attachments
+    example = ExampleUploadWithAttachments(
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image"},
+        attachments={
+            "image1": ("image/png", b"fake image data 1"),
+            "image2": ("image/png", b"fake image data 2"),
+        },
+    )
+    created_example = langchain_client.upload_examples_multipart(
+        dataset_id=dataset.id, uploads=[example]
+    )
+
+    # Update example with attachment operations to rename and retain attachments
+    attachments_operations = AttachmentsOperations(
+        rename={"image1": "renamed_image"},
+        retain=["image2"],  # Keep image2 as-is alongside the renamed image1
+    )
+
+    langchain_client.update_example(
+        example_id=created_example.id,
+        attachments_operations=attachments_operations,
+    )
+
+    # Verify the update
+    retrieved_example = langchain_client.read_example(
+        example_id=created_example.id,
+    )
+
+    # Check that both the renamed and the retained attachments exist
+    assert len(retrieved_example.attachments_info) == 2
+    assert "renamed_image" in retrieved_example.attachments_info
+    assert "image2" in retrieved_example.attachments_info
+    assert "image1" not in retrieved_example.attachments_info
+    assert (
+        retrieved_example.attachments_info["image2"]["reader"].read()
+        == b"fake image data 2"
+    )
+    assert (
+        retrieved_example.attachments_info["renamed_image"]["reader"].read()
+        == b"fake image data 1"
+    )
+
+    # Clean up
+    langchain_client.delete_dataset(dataset_id=dataset.id)
+
+
+def test_bulk_update_examples_with_attachments_operations(
+    langchain_client: Client,
+) -> None:
+    """Test bulk updating examples with attachment operations."""
+    dataset_name = "__test_bulk_update_attachments" + uuid4().hex[:4]
+    dataset = langchain_client.create_dataset(
+        dataset_name=dataset_name,
+        description="Test dataset for bulk updating example attachments",
+    )
+
+    # Create two examples with attachments
+    example1 = ExampleUploadWithAttachments(
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image 1"},
+        attachments={
+            "image1": ("image/png", b"fake image data 1"),
+            "extra": ("text/plain", b"extra data"),
+        },
+    )
+    example2 = ExampleUploadWithAttachments(
+        inputs={"query": "What's in this image?"},
+        outputs={"answer": "A test image 2"},
+        attachments={
+            "image2": ("image/png", b"fake image data 2"),
+            "extra": ("text/plain", b"extra data"),
+        },
+    )
+
+    created_examples = langchain_client.upload_examples_multipart(
+        dataset_id=dataset.id,
+        uploads=[example1, example2],
+    )
+    example_ids = [ex.id for ex in created_examples]
+
+    # Update both examples with different attachment operations
+    attachments_operations = [
+        AttachmentsOperations(
+            rename={"image1": "renamed_image1"},
+        ),
+        AttachmentsOperations(retain=["extra"]),
+    ]
+
+    langchain_client.update_examples(
+        example_ids=example_ids,
+        attachments_operations=attachments_operations,
+    )
+
+    # Verify the updates
+    updated_examples = list(
+        langchain_client.list_examples(
+            dataset_id=dataset.id,
+            example_ids=example_ids,
+            include_attachments=True,
+        )
+    )
+
+    # Check first example
+    assert len(updated_examples[0].attachments) == 1
+    assert "renamed_image1" in updated_examples[0].attachments
+    assert "extra" not in updated_examples[0].attachments
+
+    # Check second example
+    assert len(updated_examples[1].attachments) == 1
+    assert "extra" in updated_examples[1].attachments
+    assert "image2" not in updated_examples[1].attachments
+
+    # Check attachment data
+    assert (
+        updated_examples[0].attachments["renamed_image1"][1].read()
+        == b"fake image data 1"
+    )
+    assert updated_examples[1].attachments["extra"][1].read() == b"extra data"
+
+    # Clean up
+    langchain_client.delete_dataset(dataset_id=dataset.id)
+
+
+def test_update_examples_multipart(langchain_client: Client) -> None:
+    """Test updating examples with attachments via multipart endpoint."""
+    dataset_name = "__test_update_examples_multipart" + uuid4().hex[:4]
+    if langchain_client.has_dataset(dataset_name=dataset_name):
+        langchain_client.delete_dataset(dataset_name=dataset_name)
+
+    dataset = langchain_client.create_dataset(
+        dataset_name,
+        description="Test dataset for multipart example updates",
+        data_type=DataType.kv,
+    )
+
+    # First create some examples with attachments
+    example_1 = ExampleUploadWithAttachments(
+        inputs={"text": "hello world"},
+        attachments={
+            "file1": ("text/plain", b"original content 1"),
+            "file2": ("text/plain", b"original content 2"),
+        },
+    )
+
+    example_2 = ExampleUploadWithAttachments(
+        inputs={"text": "second example"},
+        attachments={
+            "file3": ("text/plain", b"original content 3"),
+            "file4": ("text/plain", b"original content 4"),
+        },
+    )
+
+    created_examples = langchain_client.upload_examples_multipart(
+        dataset_id=dataset.id, uploads=[example_1, example_2]
+    )
+    assert created_examples["count"] == 2
+
+    examples = list(langchain_client.list_examples(dataset_id=dataset.id))
+    example_ids = [ex.id for ex in examples]
+
+    # Now create update operations
+    update_1 = ExampleUpdateWithAttachments(
+        id=example_ids[0],
+        inputs={"text": "updated hello world"},
+        attachments={
+            "new_file1": ("text/plain", b"new content 1"),
+        },
+        attachments_operations=AttachmentsOperations(
+            rename={"file1": "renamed_file1"},
+        ),
+    )
+
+    update_2 = ExampleUpdateWithAttachments(
+        id=example_ids[1],
+        inputs={"text": "updated second example"},
+        attachments={
+            "new_file2": ("text/plain", b"new content 2"),
+        },
+        attachments_operations=AttachmentsOperations(retain=["file3"]),
+    )
+
+    # Test updating multiple examples at once
+    updated_examples = langchain_client.update_examples_multipart(
+        dataset_id=dataset.id, updates=[update_1, update_2]
+    )
+    assert updated_examples["count"] == 2
+
+    # Verify the updates
+    updated = list(
+        langchain_client.list_examples(
+            dataset_id=dataset.id,
+            include_attachments=True,
+        )
+    )
+
+    # Verify first example updates
+    example_1_updated = next(ex for ex in updated if ex.id == example_ids[0])
+    assert example_1_updated.inputs["text"] == "updated hello world"
+    assert "renamed_file1" in example_1_updated.attachments_info
+    assert "new_file1" in example_1_updated.attachments_info
+    assert "file2" not in example_1_updated.attachments_info
+    assert (
+        example_1_updated.attachments_info["renamed_file1"]["reader"].read()
+        == b"original content 1"
+    )
+    assert (
+        example_1_updated.attachments_info["new_file1"]["reader"].read()
+        == b"new content 1"
+    )
+
+    # Verify second example updates
+    example_2_updated = next(ex for ex in updated if ex.id == example_ids[1])
+    assert example_2_updated.inputs["text"] == "updated second example"
+    assert "file3" in example_2_updated.attachments_info
+    assert "new_file2" in example_2_updated.attachments_info
+    assert "file4" not in example_2_updated.attachments_info
+    assert (
+        example_2_updated.attachments_info["file3"]["reader"].read()
+        == b"original content 3"
+    )
+    assert (
+        example_2_updated.attachments_info["new_file2"]["reader"].read()
+        == b"new content 2"
+    )
+
+    # Test updating examples in different datasets fails
+    other_dataset = langchain_client.create_dataset(
+        dataset_name=dataset_name + "_other",
+        description="Other test dataset",
+    )
+    with pytest.raises(ValueError, match="All examples must be in the same dataset"):
+        langchain_client.update_examples_multipart(
+            dataset_id=dataset.id,
+            updates=[
+                ExampleUpsertWithAttachments(
+                    id=example_ids[0],
+                    inputs={"text": "update 1"},
+                ),
+                ExampleUpsertWithAttachments(
+                    id=uuid4(),
+                    inputs={"text": "update 2"},
+                ),
+            ],
+        )
+
+    # Test updating non-existent example fails
+    with pytest.raises(LangSmithNotFoundError):
+        langchain_client.update_examples_multipart(
+            dataset_id=dataset.id,
+            updates=[
+                ExampleUpsertWithAttachments(
+                    id=uuid4(),
+                    inputs={"text": "should fail"},
+                )
+            ],
+        )
+
+    # Test that a new attachment colliding with a retained/renamed name fails
+    with pytest.raises(ValueError):
+        langchain_client.update_examples_multipart(
+            dataset_id=dataset.id,
+            updates=[
+                ExampleUpdateWithAttachments(
+                    id=example_ids[0],
+                    attachments={
+                        "renamed_file1": ("text/plain", b"new content 1"),
+                    },
+                    attachments_operations=AttachmentsOperations(
+                        retain=["renamed_file1"],
+                    ),
+                )
+            ],
+        )
+
+    with pytest.raises(ValueError):
+        langchain_client.update_examples_multipart(
+            dataset_id=dataset.id,
+            updates=[
+                ExampleUpdateWithAttachments(
+                    id=example_ids[0],
+                    attachments={
+                        "foo": ("text/plain", b"new content 1"),
+                    },
+                    attachments_operations=AttachmentsOperations(
+                        rename={"renamed_file1": "foo"},
+                    ),
+                )
+            ],
+        )
+
+    # Clean up
+    langchain_client.delete_dataset(dataset_id=dataset.id)
+    langchain_client.delete_dataset(dataset_id=other_dataset.id)