[python] support uploading examples with attachments and running evals on examples with attachments (#1209)
Co-authored-by: Ankush Gola <[email protected]>
Co-authored-by: Bagatur <[email protected]>
Co-authored-by: Jake Rachleff <[email protected]>
Co-authored-by: William Fu-Hinthorn <[email protected]>
Commit 82383fe (1 parent: 6fc9f3e)
Showing 11 changed files with 2,107 additions and 47 deletions.
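
The commit title names two features: upserting dataset examples that carry file attachments, and running evals over such examples. The two benchmark scripts below exercise the upload side through upsert_examples_multipart. As a minimal sketch of that call, built only from the schema and client methods visible in this diff (the dataset name, input text, and file path are illustrative, not from the commit):

from pathlib import Path

from langsmith import Client
from langsmith.schemas import ExampleUpsertWithAttachments

client = Client()  # assumes LangSmith credentials are set in the environment
dataset = client.create_dataset("attachments-demo")  # illustrative name

# Attachments are keyed by name; each value is a (mime_type, path) pair.
example = ExampleUpsertWithAttachments(
    dataset_id=dataset.id,
    inputs={"question": "Summarize the attached report."},
    outputs={"answer": "..."},
    attachments={"report": ("text/plain", Path("/tmp/report.txt"))},
)
client.upsert_examples_multipart(upserts=[example])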
111 changes: 111 additions & 0 deletions
python/bench/upload_example_with_large_file_attachment.py
@@ -0,0 +1,111 @@
import os
import statistics
import time
from pathlib import Path
from typing import Dict

from langsmith import Client
from langsmith.schemas import ExampleUpsertWithAttachments

WRITE_BATCH = 10000


def create_large_file(size: int, directory: str) -> str:
    """Create a large file of repeated characters for benchmarking purposes."""
    filename = f"large_file_{size}.txt"
    filepath = os.path.join(directory, filename)

    # Delete the file if it already exists so each run starts fresh.
    if os.path.exists(filepath):
        print("Deleting existing file...")
        os.remove(filepath)

    print("Creating big file...")
    with open(filepath, "w") as f:
        # Write in WRITE_BATCH-sized chunks rather than building one giant string.
        curr_size = 0
        while curr_size < size:
            chunk = min(WRITE_BATCH, size - curr_size)
            f.write("a" * chunk)
            curr_size += chunk

    print("Done creating big file...")
    return filepath


DATASET_NAME = "upsert_big_file_to_dataset"


def benchmark_big_file_upload(
    size_bytes: int, num_examples: int, samples: int = 1
) -> Dict:
    """
    Benchmark multipart upserts of examples that carry a large file attachment.

    Returns timing statistics.
    """
    multipart_timings = []

    for _ in range(samples):
        client = Client()

        # Start each sample from a clean dataset.
        if client.has_dataset(dataset_name=DATASET_NAME):
            client.delete_dataset(dataset_name=DATASET_NAME)

        dataset = client.create_dataset(
            DATASET_NAME,
            description="Test dataset for big file upload",
        )
        large_file = create_large_file(size_bytes, "/tmp")
        examples = [
            ExampleUpsertWithAttachments(
                dataset_id=dataset.id,
                inputs={"a": 1},
                outputs={"b": 2},
                attachments={
                    "bigfile": ("text/plain", Path(large_file)),
                },
            )
            for _ in range(num_examples)
        ]

        # Time only the upload itself, not the dataset and file setup above.
        multipart_start = time.perf_counter()
        client.upsert_examples_multipart(upserts=examples)
        multipart_elapsed = time.perf_counter() - multipart_start

        multipart_timings.append(multipart_elapsed)

    return {
        "mean": statistics.mean(multipart_timings),
        "median": statistics.median(multipart_timings),
        "stdev": (
            statistics.stdev(multipart_timings) if len(multipart_timings) > 1 else 0
        ),
        "min": min(multipart_timings),
        "max": max(multipart_timings),
    }


size_bytes = 50_000_000
num_examples = 10


def main(size_bytes: int, num_examples: int = 1):
    """
    Run the benchmark and report timing results.
    """
    results = benchmark_big_file_upload(size_bytes, num_examples)

    print(f"\nBenchmark Results for size {size_bytes} and {num_examples} examples:")
    print("-" * 30)
    print(f"{'Metric':<15} {'Result':>20}")
    print("-" * 30)

    metrics = ["mean", "median", "stdev", "min", "max"]
    for metric in metrics:
        print(f"{metric:<15} {results[metric]:>20.4f}")

    print("-" * 30)
    print(f"{'Throughput':<15} {num_examples / results['mean']:>20.2f}")
    print("(examples/second)")


if __name__ == "__main__":
    main(size_bytes, num_examples)
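
Because every example in the run above attaches the same size_bytes file, the timing stats convert directly into payload throughput. A small helper along those lines, assuming the result dict shape returned by benchmark_big_file_upload (the helper name and the decimal-megabyte convention are my own, not from the commit):

def payload_throughput_mb_per_s(
    size_bytes: int, num_examples: int, mean_seconds: float
) -> float:
    """Approximate upload throughput in megabytes per second.

    Assumes each of the num_examples examples carries one attachment of
    size_bytes bytes, as in benchmark_big_file_upload above.
    """
    total_mb = size_bytes * num_examples / 1_000_000
    return total_mb / mean_seconds

# e.g. payload_throughput_mb_per_s(50_000_000, 10, results["mean"])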
143 changes: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
import statistics
import time
from typing import Dict
from uuid import uuid4

from langsmith import Client
from langsmith.schemas import DataType, ExampleUpsertWithAttachments


def create_large_json(length: int) -> Dict:
    """Create a large JSON-serializable object for benchmarking purposes."""
    large_array = [
        {
            "index": i,
            "data": f"This is element number {i}",
            "nested": {"id": i, "value": f"Nested value for element {i}"},
        }
        for i in range(length)
    ]

    return {
        "name": "Huge JSON" + str(uuid4()),
        "description": "This is a very large JSON object for benchmarking purposes.",
        "array": large_array,
        "metadata": {
            "created_at": "2024-10-22T19:00:00Z",
            "author": "Python Program",
            "version": 1.0,
        },
    }


def create_example_data(
    dataset_id: str, json_size: int
) -> ExampleUpsertWithAttachments:
    """Create a single example upsert payload with large inputs and outputs."""
    return ExampleUpsertWithAttachments(
        **{
            "dataset_id": dataset_id,
            "inputs": create_large_json(json_size),
            "outputs": create_large_json(json_size),
        }
    )


DATASET_NAME = "upsert_llm_evaluator_benchmark_dataset"


def benchmark_example_uploading(
    num_examples: int, json_size: int, samples: int = 1
) -> Dict:
    """
    Benchmark example uploading with the specified parameters, comparing the
    old create_examples path against the new multipart upsert path.

    Returns timing statistics for both methods.
    """
    multipart_timings, old_timings = [], []

    for _ in range(samples):
        client = Client()

        # Start each sample from a clean dataset.
        if client.has_dataset(dataset_name=DATASET_NAME):
            client.delete_dataset(dataset_name=DATASET_NAME)

        dataset = client.create_dataset(
            DATASET_NAME,
            description="Test dataset for multipart example upload",
            data_type=DataType.kv,
        )
        examples = [
            create_example_data(dataset.id, json_size) for _ in range(num_examples)
        ]

        # Old method (disabled): the create_examples endpoint fails above
        # 20 MB, so this crashes with json_size > ~100. The timer still runs
        # so the report keeps both columns.
        old_start = time.perf_counter()
        # inputs = [e.inputs for e in examples]
        # outputs = [e.outputs for e in examples]
        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
        old_elapsed = time.perf_counter() - old_start

        # New method
        multipart_start = time.perf_counter()
        client.upsert_examples_multipart(upserts=examples)
        multipart_elapsed = time.perf_counter() - multipart_start

        multipart_timings.append(multipart_elapsed)
        old_timings.append(old_elapsed)

    return {
        "old": {
            "mean": statistics.mean(old_timings),
            "median": statistics.median(old_timings),
            "stdev": statistics.stdev(old_timings) if len(old_timings) > 1 else 0,
            "min": min(old_timings),
            "max": max(old_timings),
        },
        "new": {
            "mean": statistics.mean(multipart_timings),
            "median": statistics.median(multipart_timings),
            "stdev": (
                statistics.stdev(multipart_timings)
                if len(multipart_timings) > 1
                else 0
            ),
            "min": min(multipart_timings),
            "max": max(multipart_timings),
        },
    }


json_size = 1000
num_examples = 1000


def main(json_size: int, num_examples: int):
    """
    Run the benchmark and report timing results for both upload methods.
    """
    results = benchmark_example_uploading(
        num_examples=num_examples, json_size=json_size
    )

    print(
        f"\nBenchmark Results for {num_examples} examples with JSON size {json_size}:"
    )
    print("-" * 60)
    print(f"{'Metric':<15} {'Old Method':>20} {'New Method':>20}")
    print("-" * 60)

    metrics = ["mean", "median", "stdev", "min", "max"]
    for metric in metrics:
        print(
            f"{metric:<15} {results['old'][metric]:>20.4f} "
            f"{results['new'][metric]:>20.4f}"
        )

    print("-" * 60)
    print(
        f"{'Throughput':<15} {num_examples / results['old']['mean']:>20.2f} "
        f"{num_examples / results['new']['mean']:>20.2f}"
    )
    print("(examples/second)")


if __name__ == "__main__":
    main(json_size, num_examples)
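
One caveat when reading this script's output: the old-method timer brackets only commented-out code, so the "Old Method" column reports the cost of an empty block, and its derived throughput is not meaningful until that call is restored. A sketch of re-enabling it behind a size guard, for insertion inside the sampling loop in place of the disabled block; it uses only the create_examples call already quoted in the comments, and the 100 threshold echoes the comment's rough estimate rather than a tested limit:

old_start = time.perf_counter()
if json_size <= 100:  # the old endpoint rejects payloads above ~20 MB
    inputs = [e.inputs for e in examples]
    outputs = [e.outputs for e in examples]
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
old_elapsed = time.perf_counter() - old_start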