diff --git a/.github/actions/python-integration-tests/action.yml b/.github/actions/python-integration-tests/action.yml index d8a06a5b4..a107bc56c 100644 --- a/.github/actions/python-integration-tests/action.yml +++ b/.github/actions/python-integration-tests/action.yml @@ -10,6 +10,9 @@ inputs: openai-api-key: description: "OpenAI API key" required: false + anthropic-api-key: + description: "Anthropic API key" + required: false runs: using: "composite" steps: @@ -30,7 +33,7 @@ runs: - name: Install dependencies run: | poetry install --with dev - poetry run pip install -U langchain + poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz shell: bash working-directory: python @@ -42,3 +45,13 @@ runs: run: make integration_tests_fast shell: bash working-directory: python + + - name: Run doctest + env: + LANGCHAIN_TRACING_V2: "true" + LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }} + OPENAI_API_KEY: ${{ inputs.openai-api-key }} + ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }} + run: make doctest + shell: bash + working-directory: python diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 23de6b5ea..fda330b7e 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -49,6 +49,7 @@ jobs: python-version: 3.11 langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }} openai-api-key: ${{ secrets.OPENAI_API_KEY }} + anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }} js_integration_test: name: JS Integration Test diff --git a/python/Makefile b/python/Makefile index 5e448866b..a50228ccc 100644 --- a/python/Makefile +++ b/python/Makefile @@ -12,6 +12,9 @@ integration_tests: integration_tests_fast: poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests +doctest: + poetry run pytest -n auto --durations=10 --doctest-modules langsmith + lint: poetry run ruff . poetry run mypy . diff --git a/python/langsmith/beta/__init__.py b/python/langsmith/beta/__init__.py index f9e152fd3..9240296a3 100644 --- a/python/langsmith/beta/__init__.py +++ b/python/langsmith/beta/__init__.py @@ -1,5 +1,6 @@ """Beta functionality prone to change.""" from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test +from langsmith.beta._utils import warn_beta -__all__ = ["convert_runs_to_test", "compute_test_metrics"] +__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"] diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 0fe79c8fa..0280d390a 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -49,11 +49,12 @@ from langsmith import env as ls_env from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils -from langsmith.evaluation import evaluator as ls_evaluator if TYPE_CHECKING: import pandas as pd # type: ignore + from langsmith.evaluation import evaluator as ls_evaluator + logger = logging.getLogger(__name__) _urllib3_logger = logging.getLogger("urllib3.connectionpool") @@ -728,7 +729,9 @@ def request_with_retries( args = list(e.args) msg = args[1] if len(args) > 1 else "" msg = msg.replace("session", "session (project)") - emsg = "\n".join([args[0]] + [msg] + args[2:]) + emsg = "\n".join( + [str(args[0])] + [msg] + [str(arg) for arg in args[2:]] + ) raise ls_utils.LangSmithError( f"Failed to {request_method} {url} in LangSmith API. 
{emsg}" ) from e @@ -3144,11 +3147,20 @@ def _resolve_example_id( def _select_eval_results( self, results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults], + *, + fn_name: Optional[str] = None, ) -> List[ls_evaluator.EvaluationResult]: + from langsmith.evaluation import evaluator as ls_evaluator # noqa: F811 + if isinstance(results, ls_evaluator.EvaluationResult): results_ = [results] - elif isinstance(results, dict) and "results" in results: - results_ = cast(List[ls_evaluator.EvaluationResult], results["results"]) + elif isinstance(results, dict): + if "results" in results: + results_ = cast(List[ls_evaluator.EvaluationResult], results["results"]) + else: + results_ = [ + ls_evaluator.EvaluationResult(**{"key": fn_name, **results}) + ] else: raise TypeError( f"Invalid evaluation result type {type(results)}." @@ -3208,15 +3220,20 @@ def _log_evaluation_feedback( evaluator_response: Union[ ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults ], - run: ls_schemas.Run, + run: Optional[ls_schemas.Run] = None, source_info: Optional[Dict[str, Any]] = None, + project_id: Optional[ID_TYPE] = None, ) -> List[ls_evaluator.EvaluationResult]: results = self._select_eval_results(evaluator_response) for res in results: source_info_ = source_info or {} if res.evaluator_info: source_info_ = {**res.evaluator_info, **source_info_} - run_id_ = res.target_run_id if res.target_run_id else run.id + run_id_ = None + if res.target_run_id: + run_id_ = res.target_run_id + elif run is not None: + run_id_ = run.id self.create_feedback( run_id_, res.key, @@ -3227,6 +3244,7 @@ def _log_evaluation_feedback( source_info=source_info_, source_run_id=res.source_run_id, feedback_source_type=ls_schemas.FeedbackSourceType.MODEL, + project_id=project_id, ) return results diff --git a/python/langsmith/evaluation/__init__.py b/python/langsmith/evaluation/__init__.py index 64c65c134..1dd2ecbed 100644 --- a/python/langsmith/evaluation/__init__.py +++ b/python/langsmith/evaluation/__init__.py @@ -1,11 +1,13 @@ """Evaluation Helpers.""" +from langsmith.evaluation._runner import evaluate, evaluate_existing from langsmith.evaluation.evaluator import ( EvaluationResult, EvaluationResults, RunEvaluator, run_evaluator, ) +from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator from langsmith.evaluation.string_evaluator import StringEvaluator __all__ = [ @@ -14,4 +16,7 @@ "EvaluationResults", "RunEvaluator", "StringEvaluator", + "evaluate", + "evaluate_existing", + "LangChainStringEvaluator", ] diff --git a/python/langsmith/evaluation/_name_generation.py b/python/langsmith/evaluation/_name_generation.py new file mode 100644 index 000000000..191c74632 --- /dev/null +++ b/python/langsmith/evaluation/_name_generation.py @@ -0,0 +1,727 @@ +import random + +adjectives = [ + "abandoned", + "aching", + "advanced", + "ample", + "artistic", + "back", + "best", + "bold", + "brief", + "clear", + "cold", + "complicated", + "cooked", + "crazy", + "crushing", + "damp", + "dear", + "definite", + "dependable", + "diligent", + "drab", + "earnest", + "elderly", + "enchanted", + "essential", + "excellent", + "extraneous", + "fixed", + "flowery", + "formal", + "fresh", + "frosty", + "giving", + "glossy", + "healthy", + "helpful", + "impressionable", + "kind", + "large", + "left", + "long", + "loyal", + "mealy", + "memorable", + "monthly", + "new", + "notable", + "only", + "ordinary", + "passionate", + "perfect", + "pertinent", + "proper", + "puzzled", + "reflecting", + "respectful", + "roasted", + 
"scholarly", + "shiny", + "slight", + "sparkling", + "spotless", + "stupendous", + "sunny", + "tart", + "terrific", + "timely", + "unique", + "upbeat", + "vacant", + "virtual", + "warm", + "weary", + "whispered", + "worthwhile", + "yellow", +] + +nouns = [ + "account", + "acknowledgment", + "address", + "advertising", + "airplane", + "animal", + "appointment", + "arrival", + "artist", + "attachment", + "attitude", + "availability", + "backpack", + "bag", + "balance", + "bass", + "bean", + "beauty", + "bibliography", + "bill", + "bite", + "blossom", + "boat", + "book", + "box", + "boy", + "bread", + "bridge", + "broccoli", + "building", + "butter", + "button", + "cabbage", + "cake", + "camera", + "camp", + "candle", + "candy", + "canvas", + "car", + "card", + "carrot", + "cart", + "case", + "cat", + "chain", + "chair", + "chalk", + "chance", + "change", + "channel", + "character", + "charge", + "charm", + "chart", + "check", + "cheek", + "cheese", + "chef", + "cherry", + "chicken", + "child", + "church", + "circle", + "class", + "clay", + "click", + "clock", + "cloth", + "cloud", + "clove", + "club", + "coach", + "coal", + "coast", + "coat", + "cod", + "coffee", + "collar", + "color", + "comb", + "comfort", + "comic", + "committee", + "community", + "company", + "comparison", + "competition", + "condition", + "connection", + "control", + "cook", + "copper", + "copy", + "corn", + "cough", + "country", + "cover", + "crate", + "crayon", + "cream", + "creator", + "crew", + "crown", + "current", + "curtain", + "curve", + "cushion", + "dad", + "daughter", + "day", + "death", + "debt", + "decision", + "deer", + "degree", + "design", + "desire", + "desk", + "detail", + "development", + "digestion", + "dime", + "dinner", + "direction", + "dirt", + "discovery", + "discussion", + "disease", + "disgust", + "distance", + "distribution", + "division", + "doctor", + "dog", + "door", + "drain", + "drawer", + "dress", + "drink", + "driving", + "dust", + "ear", + "earth", + "edge", + "education", + "effect", + "egg", + "end", + "energy", + "engine", + "error", + "event", + "example", + "exchange", + "existence", + "expansion", + "experience", + "expert", + "eye", + "face", + "fact", + "fall", + "family", + "farm", + "father", + "fear", + "feeling", + "field", + "finger", + "fire", + "fish", + "flag", + "flight", + "floor", + "flower", + "fold", + "food", + "football", + "force", + "form", + "frame", + "friend", + "frog", + "fruit", + "fuel", + "furniture", + "game", + "garden", + "gate", + "girl", + "glass", + "glove", + "goat", + "gold", + "government", + "grade", + "grain", + "grass", + "green", + "grip", + "group", + "growth", + "guide", + "guitar", + "hair", + "hall", + "hand", + "harbor", + "harmony", + "hat", + "head", + "health", + "heart", + "heat", + "hill", + "history", + "hobbies", + "hole", + "hope", + "horn", + "horse", + "hospital", + "hour", + "house", + "humor", + "idea", + "impulse", + "income", + "increase", + "industry", + "ink", + "insect", + "instrument", + "insurance", + "interest", + "invention", + "iron", + "island", + "jelly", + "jet", + "jewel", + "join", + "judge", + "juice", + "jump", + "kettle", + "key", + "kick", + "kiss", + "kitten", + "knee", + "knife", + "knowledge", + "land", + "language", + "laugh", + "law", + "lead", + "learning", + "leather", + "leg", + "lettuce", + "level", + "library", + "lift", + "light", + "limit", + "line", + "linen", + "lip", + "liquid", + "list", + "look", + "loss", + "love", + "lunch", + "machine", + "man", + "manager", + "map", + "marble", + 
"mark", + "market", + "mass", + "match", + "meal", + "measure", + "meat", + "meeting", + "memory", + "metal", + "middle", + "milk", + "mind", + "mine", + "minute", + "mist", + "mitten", + "mom", + "money", + "monkey", + "month", + "moon", + "morning", + "mother", + "motion", + "mountain", + "mouth", + "muscle", + "music", + "nail", + "name", + "nation", + "neck", + "need", + "news", + "night", + "noise", + "note", + "number", + "nut", + "observation", + "offer", + "oil", + "operation", + "opinion", + "orange", + "order", + "organization", + "ornament", + "oven", + "page", + "pail", + "pain", + "paint", + "pan", + "pancake", + "paper", + "parcel", + "parent", + "part", + "passenger", + "paste", + "payment", + "peace", + "pear", + "pen", + "pencil", + "person", + "pest", + "pet", + "picture", + "pie", + "pin", + "pipe", + "pizza", + "place", + "plane", + "plant", + "plastic", + "plate", + "play", + "pleasure", + "plot", + "plough", + "pocket", + "point", + "poison", + "police", + "pollution", + "popcorn", + "porter", + "position", + "pot", + "potato", + "powder", + "power", + "price", + "print", + "process", + "produce", + "product", + "profit", + "property", + "prose", + "protest", + "pull", + "pump", + "punishment", + "purpose", + "push", + "quarter", + "question", + "quiet", + "quill", + "quilt", + "quince", + "rabbit", + "rail", + "rain", + "range", + "rat", + "rate", + "ray", + "reaction", + "reading", + "reason", + "record", + "regret", + "relation", + "religion", + "representative", + "request", + "respect", + "rest", + "reward", + "rhythm", + "rice", + "river", + "road", + "roll", + "room", + "root", + "rose", + "route", + "rub", + "rule", + "run", + "sack", + "sail", + "salt", + "sand", + "scale", + "scarecrow", + "scarf", + "scene", + "scent", + "school", + "science", + "scissors", + "screw", + "sea", + "seat", + "secretary", + "seed", + "selection", + "self", + "sense", + "servant", + "shade", + "shake", + "shame", + "shape", + "sheep", + "sheet", + "shelf", + "ship", + "shirt", + "shock", + "shoe", + "shop", + "show", + "side", + "sign", + "silk", + "sink", + "sister", + "size", + "sky", + "sleep", + "smash", + "smell", + "smile", + "smoke", + "snail", + "snake", + "sneeze", + "snow", + "soap", + "society", + "sock", + "soda", + "sofa", + "son", + "song", + "sort", + "sound", + "soup", + "space", + "spark", + "speed", + "sponge", + "spoon", + "spray", + "spring", + "spy", + "square", + "stamp", + "star", + "start", + "statement", + "station", + "steam", + "steel", + "stem", + "step", + "stew", + "stick", + "stitch", + "stocking", + "stomach", + "stone", + "stop", + "store", + "story", + "stove", + "stranger", + "straw", + "stream", + "street", + "stretch", + "string", + "structure", + "substance", + "sugar", + "suggestion", + "suit", + "summer", + "sun", + "support", + "surprise", + "sweater", + "swim", + "system", + "table", + "tail", + "talk", + "tank", + "taste", + "tax", + "tea", + "teaching", + "team", + "tendency", + "test", + "texture", + "theory", + "thing", + "thought", + "thread", + "throat", + "thumb", + "thunder", + "ticket", + "time", + "tin", + "title", + "toad", + "toe", + "tooth", + "toothpaste", + "touch", + "town", + "toy", + "trade", + "train", + "transport", + "tray", + "treatment", + "tree", + "trick", + "trip", + "trouble", + "trousers", + "truck", + "tub", + "turkey", + "turn", + "twist", + "umbrella", + "uncle", + "underwear", + "unit", + "use", + "vacation", + "value", + "van", + "vase", + "vegetable", + "veil", + "vein", + "verse", + "vessel", + "view", 
+ "visitor", + "voice", + "volcano", + "walk", + "wall", + "war", + "wash", + "waste", + "watch", + "water", + "wave", + "wax", + "way", + "wealth", + "weather", + "week", + "weight", + "wheel", + "whip", + "whistle", + "window", + "wine", + "wing", + "winter", + "wire", + "wish", + "woman", + "wood", + "wool", + "word", + "work", + "worm", + "wound", + "wrist", + "writer", + "yard", + "yoke", + "zebra", + "zinc", + "zipper", + "zone", +] + + +def random_name() -> str: + """Generate a random name.""" + adjective = random.choice(adjectives) + noun = random.choice(nouns) + number = random.randint(1, 100) + return f"{adjective}-{noun}-{number}" diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py new file mode 100644 index 000000000..c9563caf4 --- /dev/null +++ b/python/langsmith/evaluation/_runner.py @@ -0,0 +1,990 @@ +"""V2 Evaluation Interface.""" + +from __future__ import annotations + +import collections +import concurrent.futures as cf +import datetime +import functools +import itertools +import logging +import threading +import uuid +from contextvars import copy_context +from typing import ( + Callable, + DefaultDict, + Dict, + Generator, + Iterable, + Iterator, + List, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +from requests import HTTPError +from typing_extensions import TypedDict + +import langsmith +from langsmith import beta as ls_beta +from langsmith import env as ls_env +from langsmith import run_helpers as rh +from langsmith import run_trees, schemas +from langsmith import utils as ls_utils +from langsmith.evaluation.evaluator import ( + EvaluationResult, + EvaluationResults, + RunEvaluator, + run_evaluator, +) +from langsmith.evaluation.integrations import LangChainStringEvaluator + +logger = logging.getLogger(__name__) + +TARGET_T = Callable[[dict], dict] +# Data format: dataset-name, dataset_id, or examples +DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example]] +# Summary evaluator runs over the whole dataset +# and reports aggregate metric(s) +SUMMARY_EVALUATOR_T = Callable[ + [Sequence[schemas.Run], Sequence[schemas.Example]], + Union[EvaluationResult, EvaluationResults], +] +# Row-level evaluator +EVALUATOR_T = Union[ + RunEvaluator, + Callable[[schemas.Run, Optional[schemas.Example]], EvaluationResult], +] + + +@ls_beta.warn_beta +def evaluate( + target: TARGET_T, + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + blocking: bool = True, +) -> ExperimentResults: + r"""Evaluate a target system or function on a given dataset. + + Args: + target (TARGET_T): The target system or function to evaluate. + data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of + examples, or a generator of examples. + evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run + on each example. Defaults to None. + summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary + evaluators to run on the entire dataset. Defaults to None. + metadata (Optional[dict]): Metadata to attach to the experiment. + Defaults to None. + experiment_prefix (Optional[str]): A prefix to provide for your experiment name. + Defaults to None. + max_concurrency (Optional[int]): The maximum number of concurrent + evaluations to run. Defaults to None. 
+ client (Optional[langsmith.Client]): The LangSmith client to use. + Defaults to None. + blocking (bool): Whether to block until the evaluation is complete. + Defaults to True. + + Returns: + ExperimentResults: The results of the evaluation. + + Examples: + Prepare the dataset: + + >>> from typing import Sequence + >>> from langsmith import Client + >>> from langsmith.evaluation import evaluate + >>> from langsmith.schemas import Example, Run + >>> client = Client() + >>> client.clone_public_dataset( + ... "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d" + ... ) + >>> dataset_name = "Evaluate Examples" + + Basic usage: + + >>> def accuracy(run: Run, example: Example): + ... # Row-level evaluator for accuracy. + ... pred = run.outputs["output"] + ... expected = example.outputs["answer"] + ... return {"score": expected.lower() == pred.lower()} + ... + >>> def precision(runs: Sequence[Run], examples: Sequence[Example]): + ... # Experiment-level evaluator for precision. + ... # TP / (TP + FP) + ... predictions = [run.outputs["output"].lower() for run in runs] + ... expected = [example.outputs["answer"].lower() for example in examples] + ... # yes and no are the only possible answers + ... tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"]) + ... fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)]) + ... return {"score": tp / (tp + fp)} + ... + >>> def predict(inputs: dict) -> dict: + ... # This can be any function or just an API call to your app. + ... return {"output": "Yes"} + ... + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + + Evaluating over only a subset of the examples + + >>> experiment_name = results.experiment_name + >>> examples = client.list_examples(dataset_name=dataset_name, limit=5) + >>> results = evaluate( + ... predict, + ... data=examples, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... experiment_prefix="My Experiment", + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + + Streaming each prediction to more easily + eagerly debug. + + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... blocking=False, + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + >>> for i, result in enumerate(results): # doctest: +ELLIPSIS + ... pass + + Using the `evaluate` API with an off-the-shelf LangChain evaluator: + + >>> from langsmith.evaluation import LangChainStringEvaluator + >>> def prepare_criteria_data(run: Run, example: Example): + ... return { + ... "prediction": run.outputs["output"], + ... "reference": example.outputs["answer"], + ... "input": str(example.inputs), + ... } + ... + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[ + ... accuracy, + ... LangChainStringEvaluator("embedding_distance"), + ... LangChainStringEvaluator( + ... "labeled_criteria", + ... config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if it is correct" + ... " and/or asks a useful followup question." + ... }, + ... }, + ... prepare_data=prepare_criteria_data + ... ), + ... ], + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... 
+ + Evaluating a LangChain object: + + >>> from langchain_core.runnables import chain as as_runnable + >>> @as_runnable + ... def nested_predict(inputs): + ... return {"output": "Yes"} + ... + >>> @as_runnable + ... def lc_predict(inputs): + ... return nested_predict.invoke(inputs) + ... + >>> results = evaluate( + ... lc_predict.invoke, + ... data=dataset_name, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + """ # noqa: E501 + return _evaluate( + target, + data=data, + evaluators=evaluators, + summary_evaluators=summary_evaluators, + metadata=metadata, + experiment_prefix=experiment_prefix, + max_concurrency=max_concurrency, + client=client, + blocking=blocking, + ) + + +@ls_beta.warn_beta +def evaluate_existing( + experiment: Union[str, uuid.UUID], + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + load_nested: bool = False, + blocking: bool = True, +) -> ExperimentResults: + r"""Evaluate existing experiment runs. + + Args: + experiment (Union[str, uuid.UUID]): The identifier of the experiment to evaluate. + data (DATA_T): The data to use for evaluation. + evaluators (Optional[Sequence[EVALUATOR_T]]): Optional sequence of evaluators to use for individual run evaluation. + summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators + to apply over the entire dataset. + metadata (Optional[dict]): Optional metadata to include in the evaluation results. + max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. + load_nested: Whether to load all child runs for the experiment. + Default is to only load the top-level root runs. + blocking (bool): Whether to block until evaluation is complete. + + Returns: + ExperimentResults: The evaluation results. + + Examples: + >>> from langsmith.evaluation import evaluate, evaluate_existing + >>> dataset_name = "Evaluate Examples" + >>> def predict(inputs: dict) -> dict: + ... # This can be any function or just an API call to your app. + ... return {"output": "Yes"} + ... + >>> # First run inference on the dataset + ... results = evaluate( + ... predict, + ... data=dataset_name, + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + >>> # Then apply evaluators to the experiment + ... def accuracy(run: Run, example: Example): + ... # Row-level evaluator for accuracy. + ... pred = run.outputs["output"] + ... expected = example.outputs["answer"] + ... return {"score": expected.lower() == pred.lower()} + ... + >>> def precision(runs: Sequence[Run], examples: Sequence[Example]): + ... # Experiment-level evaluator for precision. + ... # TP / (TP + FP) + ... predictions = [run.outputs["output"].lower() for run in runs] + ... expected = [example.outputs["answer"].lower() for example in examples] + ... # yes and no are the only possible answers + ... tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"]) + ... fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)]) + ... 
return {"score": tp / (tp + fp)} + >>> experiment_name = results.experiment_name # Can use the returned experiment name + >>> experiment_name = "My Experiment:64e6e91" # Or manually specify + >>> results = evaluate_existing( + ... experiment_name, + ... data=dataset_name, + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + """ # noqa: E501 + client = client or langsmith.Client() + runs = _load_traces(experiment, client, load_nested=load_nested) + return _evaluate( + runs, + data=data, + evaluators=evaluators, + summary_evaluators=summary_evaluators, + metadata=metadata, + max_concurrency=max_concurrency, + client=client, + blocking=blocking, + ) + + +class ExperimentResultRow(TypedDict): + run: schemas.Run + example: schemas.Example + evaluation_results: EvaluationResults + + +class ExperimentResults: + """Represents the results of an evaluate() call. + + This class provides an iterator interface to iterate over the experiment results + as they become available. It also provides methods to access the experiment name, + the number of results, and to wait for the results to be processed. + + Methods: + experiment_name() -> str: Returns the name of the experiment. + wait() -> None: Waits for the experiment data to be processed. + """ + + def __init__( + self, + experiment_manager: _ExperimentManager, + ): + self._manager = experiment_manager + self._results: List[ExperimentResultRow] = [] + self._lock = threading.RLock() + self._thread = threading.Thread( + target=lambda: self._process_data(self._manager) + ) + self._thread.start() + + @property + def experiment_name(self) -> str: + return self._manager.experiment_name + + def __iter__(self) -> Iterator[ExperimentResultRow]: + processed_count = 0 + while True: + with self._lock: + if processed_count < len(self._results): + yield self._results[processed_count] + processed_count += 1 + elif not self._thread.is_alive(): + break + + def _process_data(self, manager: _ExperimentManager) -> None: + tqdm = _load_tqdm() + results = manager.get_results() + for item in tqdm(results): + with self._lock: + self._results.append(item) + summary_scores = manager.get_summary_scores() + with self._lock: + self._summary_results = summary_scores + + def __len__(self) -> int: + return len(self._results) + + def __repr__(self) -> str: + return f"" + + def wait(self) -> None: + """Wait for the evaluation runner to complete. + + This method blocks the current thread until the evaluation runner has + finished its execution. + """ + self._thread.join() + + +## Private API + + +def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run]]) -> bool: + return callable(target) or (hasattr(target, "invoke") and callable(target.invoke)) + + +def _evaluate( + target: Union[TARGET_T, Iterable[schemas.Run]], + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + blocking: bool = True, +) -> ExperimentResults: + # Initialize the experiment manager. + manager = _ExperimentManager( + data, + client=client, + metadata=metadata, + experiment_prefix=experiment_prefix, + # If provided, we don't need to create a new experiment. + runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target), + # Create or resolve the experiment. 
+ ).start() + if _is_callable(target): + # Add predictions to the experiment. + manager = manager.with_predictions( + cast(TARGET_T, target), max_concurrency=max_concurrency + ) + if evaluators: + # Apply evaluators to the predictions. + manager = manager.with_evaluators(evaluators, max_concurrency=max_concurrency) + if summary_evaluators: + # Apply the experiment-level summary evaluators. + manager = manager.with_summary_evaluators(summary_evaluators) + # Start consuming the results. + results = ExperimentResults(manager) + if blocking: + # Wait for the evaluation to complete. + results.wait() + return results + + +def _is_uuid(value: str) -> bool: + try: + uuid.UUID(value) + return True + except ValueError: + return False + + +def _load_traces( + project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False +) -> Iterable[schemas.Run]: + """Load nested traces for a given project.""" + execution_order = None if load_nested else 1 + if isinstance(project, uuid.UUID) or _is_uuid(project): + runs = client.list_runs(project_id=project, execution_order=execution_order) + else: + runs = client.list_runs(project_name=project, execution_order=execution_order) + if not load_nested: + return runs + + treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list) + results = [] + all_runs = {} + for run in runs: + if run.parent_run_id is not None: + treemap[run.parent_run_id].append(run) + else: + results.append(run) + all_runs[run.id] = run + for run_id, child_runs in treemap.items(): + all_runs[run_id].child_runs = sorted(child_runs, key=lambda r: r.dotted_order) + return results + + +def _load_tqdm() -> Callable[[Iterable], Iterable]: + try: + from tqdm.auto import tqdm + except ImportError: + return lambda x: x + return tqdm + + +class _ExperimentManager: + """Manage the execution of experiments. + + Supports lazily running predictions and evaluations in parallel to facilitate + result streaming and early debugging. + + Args: + data (DATA_T): The data used for the experiment. Can be a dataset name or ID OR + a generator of examples. + runs (Optional[Iterable[schemas.Run]]): The runs associated with the experiment + predictions. + experiment (Optional[schemas.TracerSession]): The tracer session + associated with the experiment. + experiment_prefix (Optional[str]): The prefix for the experiment name. + metadata (Optional[dict]): Additional metadata for the experiment. + client (Optional[langsmith.Client]): The Langsmith client used for + the experiment. + evaluation_results (Optional[Iterable[EvaluationResults]]): The evaluation + sresults for the experiment. + summary_results (Optional[Iterable[EvaluationResults]]): The aggregate results + for the experiment. + """ + + def __init__( + self, + data: DATA_T, + /, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + runs: Optional[Iterable[schemas.Run]] = None, + experiment: Optional[schemas.TracerSession] = None, + client: Optional[langsmith.Client] = None, + evaluation_results: Optional[Iterable[EvaluationResults]] = None, + summary_results: Optional[Iterable[EvaluationResults]] = None, + ): + self.client = client or langsmith.Client() + # If we've already created or started the experiment, + # use that name. 
+ experiment_name, experiment, runs = _resolve_experiment_name( + experiment, + experiment_prefix, + runs, + self.client, + ) + self.experiment_name = experiment_name + self._experiment = experiment + self._runs = runs + metadata = metadata or {} + if not metadata.get("revision_id"): + metadata = { + "revision_id": ls_env.get_langchain_env_var_metadata().get( + "revision_id" + ), + **metadata, + } + self._metadata = metadata or {} + self._data = data + self._examples: Optional[Iterable[schemas.Example]] = None + self._evaluation_results = evaluation_results + self._summary_results = summary_results + + @property + def examples(self) -> Iterable[schemas.Example]: + if self._examples is None: + self._examples = _resolve_data(self._data, client=self.client) + self._examples, examples_iter = itertools.tee(self._examples) + return examples_iter + + @property + def evaluation_results(self) -> Iterable[EvaluationResults]: + if self._evaluation_results is None: + return [{"results": []} for _ in self.examples] + return self._evaluation_results + + @property + def runs(self) -> Iterable[schemas.Run]: + if self._runs is None: + raise ValueError( + "Runs not provided in this experiment." " Please predict first." + ) + self._runs, runs_iter = itertools.tee(self._runs) + return runs_iter + + def start(self) -> _ExperimentManager: + first_example = next(itertools.islice(self.examples, 1)) + _examples = itertools.chain([first_example], self.examples) + if self._experiment is None: + try: + project_metadata = self._metadata or {} + git_info = ls_env.get_git_info() + if git_info: + project_metadata = { + **project_metadata, + "git": git_info, + } + project = self.client.create_project( + self.experiment_name, + reference_dataset_id=first_example.dataset_id, + metadata=project_metadata, + ) + except (HTTPError, ValueError, ls_utils.LangSmithError) as e: + if "already exists " not in str(e): + raise e + raise ValueError( + # TODO: Better error + f"Experiment {self.experiment_name} already exists." + " Please use a different name." + ) + else: + project = self._experiment + if project.url: + # TODO: Make this a public API + project_url = project.url.split("?")[0] + dataset_id = first_example.dataset_id + base_url = project_url.split("/projects/p/")[0] + comparison_url = ( + f"{base_url}/datasets/{dataset_id}/compare?" + f"selectedSessions={project.id}" + ) + print( # noqa: T201 + f"View the evaluation results for experiment: '{self.experiment_name}'" + f" at:\n{comparison_url}\n\n" + ) + else: + # HACKHACK + print("Starting evaluation of experiment: %s", self.experiment_name) + return _ExperimentManager( + _examples, + experiment=project, + metadata=self._metadata, + client=self.client, + runs=self._runs, + evaluation_results=self._evaluation_results, + ) + + def with_predictions( + self, + target: TARGET_T, + /, + max_concurrency: Optional[int] = None, + ) -> _ExperimentManager: + """Lazily apply the target function to the experiment.""" + context = copy_context() + _experiment_results = context.run( + self._predict, target, max_concurrency=max_concurrency + ) + r1, r2 = itertools.tee(_experiment_results, 2) + return _ExperimentManager( + (pred["example"] for pred in r1), + experiment=self._experiment, + metadata=self._metadata, + client=self.client, + runs=(pred["run"] for pred in r2), + # TODO: Can't do multiple prediction rounds rn. 
+ ) + + def with_evaluators( + self, + evaluators: Sequence[ + Union[ + EVALUATOR_T, + RunEvaluator, + ] + ], + *, + max_concurrency: Optional[int] = None, + ) -> _ExperimentManager: + """Lazily apply the provided evaluators to the experiment.""" + evaluators = _resolve_evaluators(evaluators) + context = copy_context() + experiment_results = context.run( + self._score, evaluators, max_concurrency=max_concurrency + ) + # Split the generator into three so the manager + # can consume each value individually. + r1, r2, r3 = itertools.tee(experiment_results, 3) + return _ExperimentManager( + (result["example"] for result in r1), + experiment=self._experiment, + metadata=self._metadata, + client=self.client, + runs=(result["run"] for result in r2), + evaluation_results=(result["evaluation_results"] for result in r3), + summary_results=self._summary_results, + ) + + def with_summary_evaluators( + self, + summary_evaluators: Sequence[SUMMARY_EVALUATOR_T], + ) -> _ExperimentManager: + """Lazily apply the provided summary evaluators to the experiment.""" + wrapped_evaluators = _wrap_summary_evaluators(summary_evaluators) + context = copy_context() + aggregate_feedback_gen = context.run( + self._apply_summary_evaluators, wrapped_evaluators + ) + return _ExperimentManager( + self.examples, + experiment=self._experiment, + metadata=self._metadata, + client=self.client, + runs=self.runs, + evaluation_results=self._evaluation_results, + summary_results=aggregate_feedback_gen, + ) + + def get_results(self) -> Iterable[ExperimentResultRow]: + """Return the traces, evaluation results, and associated examples.""" + for run, example, evaluation_results in zip( + self.runs, self.examples, self.evaluation_results + ): + yield ExperimentResultRow( + run=run, + example=example, + evaluation_results=evaluation_results, + ) + + def get_summary_scores(self) -> Dict[str, List[dict]]: + """If summary_evaluators were applied, consume and return the results.""" + if self._summary_results is None: + return {"results": []} + # Consume the generator + return { + "results": [ + res for results in self._summary_results for res in results["results"] + ] + } + + # Private methods. 
+ + def _get_experiment(self) -> schemas.TracerSession: + if self._experiment is None: + raise ValueError("Experiment not started yet.") + return self._experiment + + def _end(self) -> None: + experiment = self._experiment + if experiment is None: + raise ValueError("Experiment not started yet.") + examples = list(self.examples) + modified_at = [ex.modified_at for ex in examples if ex.modified_at] + # Should always be defined in practice when fetched, + # but the typing permits None + max_modified_at = max(modified_at) if modified_at else None + + self.client.update_project( + experiment.id, + end_time=datetime.datetime.now(datetime.timezone.utc), + metadata={ + "dataset_version": ( + max_modified_at.isoformat() if max_modified_at else None + ) + }, + ) + + def _predict( + self, target: TARGET_T, /, max_concurrency: Optional[int] = None + ) -> Generator[_ForwardResults, None, None]: + """Run the target function on the examples.""" + fn = _ensure_traceable(target) + if max_concurrency == 0: + for example in self.examples: + yield _forward( + fn, example, self.experiment_name, self._metadata, self.client + ) + + else: + with cf.ThreadPoolExecutor(max_concurrency) as executor: + futures = [ + executor.submit( + _forward, + fn, + example, + self.experiment_name, + self._metadata, + self.client, + ) + for example in self.examples + ] + for future in cf.as_completed(futures): + yield future.result() + # Close out the project. + self._end() + + def _run_evaluators( + self, + evaluators: Sequence[RunEvaluator], + current_results: ExperimentResultRow, + ) -> ExperimentResultRow: + current_context = rh.get_tracing_context() + metadata = { + **(current_context["metadata"] or {}), + **{"experiment": self.experiment_name}, + } + with rh.tracing_context( + **{**current_context, "project_name": "evaluators", "metadata": metadata} + ): + run = current_results["run"] + example = current_results["example"] + eval_results = current_results["evaluation_results"] + for evaluator in evaluators: + try: + evaluator_response = evaluator.evaluate_run( + run=run, + example=example, + ) + eval_results["results"].extend( + # TODO: This is a hack + self.client._log_evaluation_feedback( + evaluator_response, + run=run, + ) + ) + except Exception as e: + logger.error( + f"Error running evaluator {repr(evaluator)} on" + f" run {run.id}: {repr(e)}", + exc_info=True, + ) + return ExperimentResultRow( + run=run, + example=example, + evaluation_results=eval_results, + ) + + def _score( + self, + evaluators: Sequence[RunEvaluator], + max_concurrency: Optional[int] = None, + ) -> Iterable[ExperimentResultRow]: + """Run the evaluators on the prediction stream. + + Expects runs to be available in the manager. + (e.g. 
from a previous prediction step) + """ + if max_concurrency == 0: + for current_results in self.get_results(): + yield self._run_evaluators(evaluators, current_results) + else: + with cf.ThreadPoolExecutor(max_workers=max_concurrency) as executor: + futures = [] + for current_results in self.get_results(): + futures.append( + executor.submit( + self._run_evaluators, + evaluators, + current_results, + ) + ) + for future in cf.as_completed(futures): + result = future.result() + yield result + + def _apply_summary_evaluators( + self, summary_evaluators: Sequence[SUMMARY_EVALUATOR_T] + ) -> Generator[EvaluationResults, None, None]: + runs, examples = [], [] + for run, example in zip(self.runs, self.examples): + runs.append(run) + examples.append(example) + aggregate_feedback = [] + with cf.ThreadPoolExecutor() as executor: + project_id = self._get_experiment().id + for evaluator in summary_evaluators: + try: + summary_eval_result = evaluator(runs, examples) + # TODO: Expose public API for this. + flattened_results = self.client._select_eval_results( + summary_eval_result, + fn_name=evaluator.__name__, + ) + aggregate_feedback.extend(flattened_results) + for result in flattened_results: + feedback = result.dict(exclude={"target_run_id"}) + evaluator_info = feedback.pop("evaluator_info", None) + executor.submit( + self.client.create_feedback, + **feedback, + run_id=None, + project_id=project_id, + source_info=evaluator_info, + ) + except Exception as e: + logger.error( + f"Error running summary evaluator {repr(evaluator)}: {e}" + ) + yield {"results": aggregate_feedback} + + +def _resolve_evaluators( + evaluators: Sequence[EVALUATOR_T], +) -> Sequence[RunEvaluator]: + results = [] + for evaluator in evaluators: + if isinstance(evaluator, RunEvaluator): + results.append(evaluator) + elif isinstance(evaluator, LangChainStringEvaluator): + results.append(evaluator.as_run_evaluator()) + else: + results.append(run_evaluator(evaluator)) + return results + + +def _wrap_summary_evaluators( + evaluators: Sequence[SUMMARY_EVALUATOR_T], +) -> List[SUMMARY_EVALUATOR_T]: + def _wrap(evaluator: SUMMARY_EVALUATOR_T) -> SUMMARY_EVALUATOR_T: + eval_name = getattr(evaluator, "__name__", "BatchEvaluator") + + @functools.wraps(evaluator) + def _wrapper_inner( + runs: Sequence[schemas.Run], examples: Sequence[schemas.Example] + ) -> EvaluationResults: + @rh.traceable(name=eval_name) + def _wrapper_super_inner(runs_: str, examples_: str) -> EvaluationResults: + return evaluator(runs, examples) + + return _wrapper_super_inner( + f"Runs[] (Length={len(runs)})", f"Examples[] (Length={len(examples)})" + ) + + return _wrapper_inner + + results = [] + for evaluator in evaluators: + results.append(_wrap(evaluator)) + return results + + +class _ForwardResults(TypedDict): + run: schemas.Run + example: schemas.Example + + +def _forward( + fn: rh.SupportsLangsmithExtra, + example: schemas.Example, + experiment_name: str, + metadata: dict, + client: langsmith.Client, +) -> _ForwardResults: + run: Optional[schemas.RunBase] = None + + def _get_run(r: run_trees.RunTree) -> None: + nonlocal run + run = r + + try: + fn( + example.inputs, + langsmith_extra=rh.LangSmithExtra( + reference_example_id=example.id, + on_end=_get_run, + project_name=experiment_name, + metadata=metadata, + client=client, + ), + ) + except Exception as e: + logger.error(f"Error running target function: {e}") + return _ForwardResults( + run=cast(schemas.Run, run), + example=example, + ) + + +def _resolve_data( + data: DATA_T, *, client: langsmith.Client +) -> 
Iterable[schemas.Example]: + """Return the examples for the given dataset.""" + if isinstance(data, str): + return client.list_examples(dataset_name=data) + elif isinstance(data, uuid.UUID): + return client.list_examples(dataset_id=data) + return data + + +def _ensure_traceable(target: TARGET_T) -> rh.SupportsLangsmithExtra: + """Ensure the target function is traceable.""" + if not callable(target): + raise ValueError("Target must be a callable function.") + if rh.is_traceable_function(target): + fn = cast(rh.SupportsLangsmithExtra, target) + else: + fn = rh.traceable(name="Target")(target) + return fn + + +def _resolve_experiment_name( + experiment: Optional[schemas.TracerSession], + experiment_prefix: Optional[str], + runs: Optional[Iterable[schemas.Run]], + client: langsmith.Client, +) -> Tuple[str, Optional[schemas.TracerSession], Optional[Iterable[schemas.Run]]]: + if experiment is not None: + if not experiment.name: + raise ValueError("Experiment name must be defined if provided.") + return experiment.name, experiment, runs + # If we have runs, that means the experiment was already started. + if runs is not None: + runs, runs_iter = itertools.tee(runs) + first_run = next(runs_iter) + experiment = client.read_project(project_id=first_run.session_id) + if not experiment.name: + raise ValueError("Experiment name not found for provided runs.") + return experiment.name, experiment, runs + # Otherwise, we will generate a new experiment name. + if isinstance(experiment_prefix, str): + return experiment_prefix + ":" + uuid.uuid4().hex[:7], None, None + return _get_random_name(), None, None + + +def _get_random_name() -> str: + from langsmith.evaluation._name_generation import random_name # noqa: F401 + + return random_name() diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 24f280ef8..65e123042 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -194,6 +194,10 @@ def __call__( """ # noqa: E501 return self.evaluate_run(run, example) + def __repr__(self) -> str: + """String representation of the DynamicRunEvaluator object.""" + return f"" + def run_evaluator( func: Callable[ diff --git a/python/langsmith/evaluation/integrations/__init__.py b/python/langsmith/evaluation/integrations/__init__.py new file mode 100644 index 000000000..ef8a30345 --- /dev/null +++ b/python/langsmith/evaluation/integrations/__init__.py @@ -0,0 +1,8 @@ +"""This module provides integration wrappers for popular open source eval frameworks. + +to be used with LangSmith. 
+""" + +from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator + +__all__ = ["LangChainStringEvaluator"] diff --git a/python/langsmith/evaluation/integrations/_langchain.py b/python/langsmith/evaluation/integrations/_langchain.py new file mode 100644 index 000000000..3e0f79016 --- /dev/null +++ b/python/langsmith/evaluation/integrations/_langchain.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union + +from langsmith.evaluation.evaluator import run_evaluator +from langsmith.run_helpers import traceable +from langsmith.schemas import Example, Run + +if TYPE_CHECKING: + from langchain.evaluation.schema import StringEvaluator + + from langsmith.evaluation.evaluator import RunEvaluator + + +class SingleEvaluatorInput(TypedDict): + """The input to a `StringEvaluator`.""" + + prediction: str + """The prediction string.""" + reference: Optional[Any] + """The reference string.""" + input: Optional[str] + """The input string.""" + + +class LangChainStringEvaluator: + r"""A class for wrapping a LangChain StringEvaluator. + + Attributes: + evaluator (StringEvaluator): The underlying StringEvaluator OR the name + of the evaluator to load. + + Methods: + as_run_evaluator() -> RunEvaluator: + Convert the LangChainStringEvaluator to a RunEvaluator. + + Examples: + Creating a simple LangChainStringEvaluator: + + >>> evaluator = LangChainStringEvaluator("exact_match") + + Converting a LangChainStringEvaluator to a RunEvaluator: + + >>> from langsmith.evaluation import LangChainStringEvaluator + >>> evaluator = LangChainStringEvaluator( + ... "criteria", + ... config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if" + ... " it is correct and/or asks a useful followup question." + ... }, + ... } + ... ) + >>> run_evaluator = evaluator.as_run_evaluator() + >>> run_evaluator # doctest: +ELLIPSIS + + + Using the `evaluate` API with different evaluators: + >>> def prepare_data(run: Run, example: Example): + ... # Convert the evaluation data into the format expected by the evaluator + ... # Only required for datasets with multiple inputs/output keys + ... return { + ... "prediction": run.outputs["prediction"], + ... "reference": example.outputs["answer"], + ... "input": str(example.inputs), + ... } + ... + >>> import re + >>> from langchain_anthropic import ChatAnthropic + >>> import langsmith + >>> from langsmith.evaluation import LangChainStringEvaluator, evaluate + >>> criteria_evaluator = LangChainStringEvaluator( + ... "criteria", config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if it is correct" + ... " and/or asks a useful followup question." + ... }, + ... "llm": ChatAnthropic(model="claude-3-opus-20240229") + ... }, + ... prepare_data=prepare_data + ... ) + >>> embedding_evaluator = LangChainStringEvaluator("embedding_distance") + >>> exact_match_evaluator = LangChainStringEvaluator("exact_match") + >>> regex_match_evaluator = LangChainStringEvaluator( + ... "regex_match", config={ + ... "flags": re.IGNORECASE + ... }, + ... prepare_data=prepare_data + ... ) + >>> scoring_evaluator = LangChainStringEvaluator( + ... "labeled_score_string", config={ + ... "criteria": { + ... "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate" + ... }, + ... "normalize_by": 10 + ... }, + ... prepare_data=prepare_data + ... ) + >>> string_distance_evaluator = LangChainStringEvaluator( + ... 
"string_distance", config={ + ... "distance_metric": "levenshtein" + ... }, + ... prepare_data=prepare_data + ... ) + >>> from langsmith import Client + >>> client = Client() + >>> results = evaluate( + ... lambda inputs: {"prediction": "foo"}, + ... data=client.list_examples(dataset_name="Evaluate Examples", limit=1), + ... evaluators=[ + ... embedding_evaluator, + ... criteria_evaluator, + ... exact_match_evaluator, + ... regex_match_evaluator, + ... scoring_evaluator, + ... string_distance_evaluator + ... ], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + """ # noqa: E501 + + def __init__( + self, + evaluator: Union[StringEvaluator, str], + *, + config: Optional[dict] = None, + prepare_data: Optional[ + Callable[[Run, Optional[Example]], SingleEvaluatorInput] + ] = None, + ): + """Initialize a LangChainStringEvaluator. + + See: https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.StringEvaluator.html#langchain-evaluation-schema-stringevaluator + + Args: + evaluator (StringEvaluator): The underlying StringEvaluator. + """ + from langchain.evaluation.schema import StringEvaluator # noqa: F811 + + if isinstance(evaluator, StringEvaluator): + self.evaluator = evaluator + elif isinstance(evaluator, str): + from langchain.evaluation import load_evaluator # noqa: F811 + + self.evaluator = load_evaluator(evaluator, **(config or {})) # type: ignore[assignment, arg-type] + else: + raise NotImplementedError(f"Unsupported evaluator type: {type(evaluator)}") + + self._prepare_data = prepare_data + + def as_run_evaluator( + self, + ) -> RunEvaluator: + """Convert the LangChainStringEvaluator to a RunEvaluator. + + This is the object used in the LangSmith `evaluate` API. + + Returns: + RunEvaluator: The converted RunEvaluator. + """ + input_str = ( + "\n \"input\": example.inputs['input']," + if self.evaluator.requires_input + else "" + ) + reference_str = ( + "\n \"reference\": example.outputs['expected']" + if self.evaluator.requires_reference + else "" + ) + customization_error_str = f""" +def prepare_data(run, example): + return {{ + "prediction": run.outputs['my_output'],{reference_str}{input_str} + }} +evaluator = LangChainStringEvaluator(..., prepare_data=prepare_data) +""" + + @traceable + def prepare_evaluator_inputs( + run: Run, example: Optional[Example] = None + ) -> SingleEvaluatorInput: + if run.outputs and len(run.outputs) > 1: + raise ValueError( + f"Evaluator {self.evaluator} only supports a single prediction " + "key. Please ensure that the run has a single output." + " Or initialize with a prepare_data:\n" + f"{customization_error_str}" + ) + if ( + self.evaluator.requires_reference + and example + and example.outputs + and len(example.outputs) > 1 + ): + raise ValueError( + f"Evaluator {self.evaluator} nly supports a single reference key. " + "Please ensure that the example has a single output." + " Or create a custom evaluator yourself:\n" + f"{customization_error_str}" + ) + if ( + self.evaluator.requires_input + and example + and example.inputs + and len(example.inputs) > 1 + ): + raise ValueError( + f"Evaluator {self.evaluator} only supports a single input key. " + "Please ensure that the example has a single input." 
+ " Or initialize with a prepare_data:\n" + f"{customization_error_str}" + ) + + return SingleEvaluatorInput( + prediction=next(iter(run.outputs.values())), # type: ignore[union-attr] + reference=( + next(iter(example.outputs.values())) + if ( + self.evaluator.requires_reference + and example + and example.outputs + ) + else None + ), + input=( + next(iter(example.inputs.values())) + if (self.evaluator.requires_input and example and example.inputs) + else None + ), + ) + + @traceable(name=self.evaluator.evaluation_name) + def evaluate(run: Run, example: Optional[Example] = None) -> dict: + eval_inputs = ( + prepare_evaluator_inputs(run, example) + if self._prepare_data is None + else self._prepare_data(run, example) + ) + results = self.evaluator.evaluate_strings(**eval_inputs) + return {"key": self.evaluator.evaluation_name, **results} + + return run_evaluator(evaluate) diff --git a/python/langsmith/run_helpers.py b/python/langsmith/run_helpers.py index 2d3baad26..f35aaaba2 100644 --- a/python/langsmith/run_helpers.py +++ b/python/langsmith/run_helpers.py @@ -48,10 +48,43 @@ def get_current_run_tree() -> Optional[run_trees.RunTree]: - """Get the current run tree context.""" + """Get the current run tree.""" return _PARENT_RUN_TREE.get() +def get_tracing_context() -> dict: + """Get the current tracing context.""" + return { + "parent_run": _PARENT_RUN_TREE.get(), + "project_name": _PROJECT_NAME.get(), + "tags": _TAGS.get(), + "metadata": _METADATA.get(), + } + + +@contextlib.contextmanager +def tracing_context( + *, + project_name: Optional[str] = None, + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + parent_run: Optional[run_trees.RunTree] = None, +) -> Generator[None, None, None]: + """Set the tracing context for a block of code.""" + parent_run_ = get_run_tree_context() + _PROJECT_NAME.set(project_name) + _TAGS.set(tags) + _METADATA.set(metadata) + _PARENT_RUN_TREE.set(parent_run) + try: + yield + finally: + _PROJECT_NAME.set(None) + _TAGS.set(None) + _METADATA.set(None) + _PARENT_RUN_TREE.set(parent_run_) + + get_run_tree_context = get_current_run_tree @@ -106,6 +139,7 @@ class LangSmithExtra(TypedDict, total=False): tags: Optional[List[str]] run_id: Optional[ls_client.ID_TYPE] client: Optional[ls_client.Client] + on_end: Optional[Callable[[run_trees.RunTree], Any]] class _TraceableContainer(TypedDict, total=False): @@ -116,6 +150,7 @@ class _TraceableContainer(TypedDict, total=False): outer_project: Optional[str] outer_metadata: Optional[Dict[str, Any]] outer_tags: Optional[List[str]] + on_end: Optional[Callable[[run_trees.RunTree], Any]] class _ContainerInput(TypedDict, total=False): @@ -150,6 +185,12 @@ def _container_end( logger.info(f"See trace: {run_tree.get_url()}") except Exception: pass + on_end = container.get("on_end") + if on_end is not None and callable(on_end): + try: + on_end(run_tree) + except Exception as e: + logger.warning(f"Failed to run on_end function: {e}") def _collect_extra(extra_outer: dict, langsmith_extra: LangSmithExtra) -> dict: @@ -178,13 +219,21 @@ def _setup_run( outer_project = _PROJECT_NAME.get() langsmith_extra = langsmith_extra or LangSmithExtra() parent_run_ = langsmith_extra.get("run_tree") or get_run_tree_context() + project_cv = _PROJECT_NAME.get() selected_project = ( - _PROJECT_NAME.get() # From parent trace + project_cv # From parent trace or langsmith_extra.get("project_name") # at invocation time or container_input["project_name"] # at decorator time or utils.get_tracer_project() # default ) - if not 
parent_run_ and not utils.tracing_is_enabled(): + reference_example_id = langsmith_extra.get("reference_example_id") + id_ = langsmith_extra.get("run_id") + if ( + not project_cv + and not reference_example_id + and not parent_run_ + and not utils.tracing_is_enabled() + ): utils.log_once( logging.DEBUG, "LangSmith tracing is disabled, returning original function." ) @@ -194,7 +243,9 @@ def _setup_run( outer_project=outer_project, outer_metadata=None, outer_tags=None, + on_end=langsmith_extra.get("on_end"), ) + id_ = id_ or str(uuid.uuid4()) signature = inspect.signature(func) name_ = name or func.__name__ docstring = func.__doc__ @@ -223,7 +274,6 @@ def _setup_run( tags_ = (langsmith_extra.get("tags") or []) + (outer_tags or []) _TAGS.set(tags_) tags_ += tags or [] - id_ = langsmith_extra.get("run_id", uuid.uuid4()) client_ = langsmith_extra.get("client", client) if parent_run_ is not None: new_run = parent_run_.create_child( @@ -250,7 +300,7 @@ def _setup_run( }, inputs=inputs, run_type=run_type, - reference_example_id=langsmith_extra.get("reference_example_id"), + reference_example_id=reference_example_id, project_name=selected_project, extra=extra_inner, tags=tags_, @@ -266,6 +316,7 @@ def _setup_run( outer_project=outer_project, outer_metadata=outer_metadata, outer_tags=outer_tags, + on_end=langsmith_extra.get("on_end"), ) _PROJECT_NAME.set(response_container["project_name"]) _PARENT_RUN_TREE.set(response_container["new_run"]) @@ -291,7 +342,7 @@ class SupportsLangsmithExtra(Protocol, Generic[R]): Args: *args: Variable length arguments. - langsmith_extra (Optional[Dict[str, Any]]): Optional dictionary of + langsmith_extra (Optional[LangSmithExtra): Optional dictionary of additional parameters for Langsmith. **kwargs: Keyword arguments. @@ -302,7 +353,7 @@ class SupportsLangsmithExtra(Protocol, Generic[R]): def __call__( self, *args: Any, - langsmith_extra: Optional[Dict[str, Any]] = None, + langsmith_extra: Optional[LangSmithExtra] = None, **kwargs: Any, ) -> R: """Call the instance when it is called as a function. diff --git a/python/poetry.lock b/python/poetry.lock index 89c1800ff..2890bf05b 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1041,6 +1041,17 @@ files = [ [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-tqdm" +version = "4.66.0.20240106" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-tqdm-4.66.0.20240106.tar.gz", hash = "sha256:7acf4aade5bad3ded76eb829783f9961b1c2187948eaa6dd1ae8644dff95a938"}, + {file = "types_tqdm-4.66.0.20240106-py3-none-any.whl", hash = "sha256:7459b0f441b969735685645a5d8480f7912b10d05ab45f99a2db8a8e45cb550b"}, +] + [[package]] name = "typing-extensions" version = "4.8.0" @@ -1126,4 +1137,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "79450ee1ccc1bb4c2c4b1a49ce97f336387f45a432d7c269737be112abfae8d7" +content-hash = "d410c02f3874131fe4b49ae7db5576b257263d38ed2972f5ce3c9d8ea6b010ef" diff --git a/python/pyproject.toml b/python/pyproject.toml index dbb10ad5d..5f7e9a53a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.31" +version = "0.1.33" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
authors = ["LangChain "] license = "MIT" @@ -49,6 +49,7 @@ pytest-watcher = "^0.3.4" pytest-xdist = "^3.5.0" pytest-cov = "^4.1.0" dataclasses-json = "^0.6.4" +types-tqdm = "^4.66.0.20240106" [tool.poetry.group.lint.dependencies] openai = "^1.10" diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index bccfb2f55..3ee7c81a0 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -42,36 +42,6 @@ def langchain_client(monkeypatch: pytest.MonkeyPatch) -> Client: return Client() -def test_projects(langchain_client: Client, monkeypatch: pytest.MonkeyPatch) -> None: - """Test projects.""" - new_project = "__Test Project" - if langchain_client.has_project(new_project): - langchain_client.delete_project(project_name=new_project) - - monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com") - langchain_client.create_project( - project_name=new_project, - project_extra={"evaluator": "THE EVALUATOR"}, - ) - project = langchain_client.read_project(project_name=new_project) - assert project.name == new_project - runs = list(langchain_client.list_runs(project_name=new_project)) - project_id_runs = list(langchain_client.list_runs(project_id=project.id)) - assert len(runs) == len(project_id_runs) == 0 - langchain_client.delete_project(project_name=new_project) - - with pytest.raises(LangSmithError): - langchain_client.read_project(project_name=new_project) - assert new_project not in set( - [ - sess.name - for sess in langchain_client.list_projects(name_contains=new_project) - ] - ) - with pytest.raises(LangSmithError): - langchain_client.delete_project(project_name=new_project) - - def test_datasets(langchain_client: Client) -> None: """Test datasets.""" csv_content = "col1,col2\nval1,val2" @@ -182,22 +152,6 @@ def test_error_surfaced_invalid_uri(monkeypatch: pytest.MonkeyPatch, uri: str) - client.create_run("My Run", inputs={"text": "hello world"}, run_type="llm") -@freeze_time("2023-01-01") -def test_create_project( - monkeypatch: pytest.MonkeyPatch, langchain_client: Client -) -> None: - """Test the project creation""" - monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com") - project_name = "__test_create_project" + uuid4().hex[:4] - if langchain_client.has_project(project_name): - langchain_client.delete_project(project_name=project_name) - try: - project = langchain_client.create_project(project_name=project_name) - assert project.name == project_name - finally: - langchain_client.delete_project(project_name=project_name) - - def test_create_dataset( monkeypatch: pytest.MonkeyPatch, langchain_client: Client ) -> None: