diff --git a/js/src/schemas.ts b/js/src/schemas.ts
index 9ce47af21..bcf94ee62 100644
--- a/js/src/schemas.ts
+++ b/js/src/schemas.ts
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
   last_run_start_time?: number;
   // Feedback stats for the session.
   feedback_stats?: Record;
-  // The reference dataset IDs this session's runs were generated on.
-  reference_dataset_ids?: string[];
+  // The reference dataset ID this session's runs were generated on.
+  reference_dataset_id?: string;
   // Facets for the runs in the session.
   run_facets?: KVMap[];
 }
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index 207b10bf6..465e14944 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -77,23 +77,23 @@ def evaluate(
     r"""Evaluate a target system or function on a given dataset.
 
     Args:
-        target (TARGET_T): The target system or function to evaluate.
-        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
-            examples, or a generator of examples.
-        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
-            on each example. Defaults to None.
-        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
-            evaluators to run on the entire dataset. Defaults to None.
-        metadata (Optional[dict]): Metadata to attach to the experiment.
-            Defaults to None.
-        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
-            Defaults to None.
-        max_concurrency (Optional[int]): The maximum number of concurrent
-            evaluations to run. Defaults to None.
-        client (Optional[langsmith.Client]): The LangSmith client to use.
-            Defaults to None.
-        blocking (bool): Whether to block until the evaluation is complete.
-            Defaults to True.
+        target (TARGET_T): The target system or function to evaluate.
+        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+            examples, or a generator of examples.
+        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+            on each example. Defaults to None.
+        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+            evaluators to run on the entire dataset. Defaults to None.
+        metadata (Optional[dict]): Metadata to attach to the experiment.
+            Defaults to None.
+        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+            Defaults to None.
+        max_concurrency (Optional[int]): The maximum number of concurrent
+            evaluations to run. Defaults to None.
+        client (Optional[langsmith.Client]): The LangSmith client to use.
+            Defaults to None.
+        blocking (bool): Whether to block until the evaluation is complete.
+            Defaults to True.
 
     Returns:
         ExperimentResults: The results of the evaluation.
@@ -237,7 +237,6 @@ def evaluate(
 def evaluate_existing(
     experiment: Union[str, uuid.UUID],
     /,
-    data: DATA_T,
     evaluators: Optional[Sequence[EVALUATOR_T]] = None,
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
@@ -297,13 +296,21 @@ def evaluate_existing(
         >>> experiment_name = "My Experiment:64e6e91"  # Or manually specify
         >>> results = evaluate_existing(
         ...     experiment_name,
-        ...     data=dataset_name,
         ...     summary_evaluators=[precision],
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
""" # noqa: E501 client = client or langsmith.Client() + project = _load_experiment(experiment, client) runs = _load_traces(experiment, client, load_nested=load_nested) + data = list( + client.list_examples( + dataset_id=project.reference_dataset_id, + as_of=project.metadata.get("dataset_version"), + ) + ) + runs = sorted(runs, key=lambda r: str(r.reference_example_id)) + data = sorted(data, key=lambda d: str(d.id)) return _evaluate( runs, data=data, @@ -403,6 +410,7 @@ def _evaluate( max_concurrency: Optional[int] = None, client: Optional[langsmith.Client] = None, blocking: bool = True, + experiment: Optional[schemas.TracerSession] = None, ) -> ExperimentResults: # Initialize the experiment manager. manager = _ExperimentManager( @@ -410,6 +418,7 @@ def _evaluate( client=client, metadata=metadata, experiment_prefix=experiment_prefix, + experiment=experiment, # If provided, we don't need to create a new experiment. runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target), # Create or resolve the experiment. @@ -441,9 +450,17 @@ def _is_uuid(value: str) -> bool: return False +def _load_experiment( + project: Union[str, uuid.UUID], client: langsmith.Client +) -> schemas.TracerSession: + if isinstance(project, uuid.UUID) or _is_uuid(project): + return client.read_project(project_id=project) + return client.read_project(project_name=project) + + def _load_traces( project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False -) -> Iterable[schemas.Run]: +) -> List[schemas.Run]: """Load nested traces for a given project.""" execution_order = None if load_nested else 1 if isinstance(project, uuid.UUID) or _is_uuid(project): @@ -451,7 +468,7 @@ def _load_traces( else: runs = client.list_runs(project_name=project, execution_order=execution_order) if not load_nested: - return runs + return list(runs) treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list) results = [] @@ -938,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None: reference_example_id=example.id, on_end=_get_run, project_name=experiment_name, - metadata=metadata, + metadata={ + **metadata, + "example_version": ( + example.modified_at.isoformat() + if example.modified_at + else example.created_at.isoformat() + ), + }, client=client, ), ) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 65e123042..a077121ff 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -161,7 +161,9 @@ def evaluate_run( Union[EvaluationResult, EvaluationResults]: The result of the evaluation. """ # noqa: E501 source_run_id = uuid.uuid4() - metadata = {"target_run_id": run.id, "experiment": run.session_id} + metadata = {"target_run_id": run.id} + if getattr(run, "session_id"): + metadata["experiment"] = run.session_id result = self.func( run, example, diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 372894cea..97e5b7cfa 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession): """The start time of the last run in the project.""" feedback_stats: Optional[Dict[str, Any]] """Feedback stats for the project.""" - reference_dataset_ids: Optional[List[UUID]] + reference_dataset_id: Optional[UUID] """The reference dataset IDs this project's runs were generated on.""" run_facets: Optional[List[Dict[str, Any]]] """Facets for the runs in the project."""