feat(pairwise evals): add option to randomize order of experiments; print URL for pairwise experiment (#672)
langchain-ai · May 8, 2024 · 1f465d8 · 1f465d8
2 parents 8ea8bda + afd97f5
commit 1f465d8
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 2 deletions.
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
@@ -9,6 +9,7 @@
 import itertools
 import logging
 import pathlib
+import random
 import threading
 import uuid
 from contextvars import copy_context
@@ -431,6 +432,7 @@ def evaluate_comparative(
     client: Optional[langsmith.Client] = None,
     metadata: Optional[dict] = None,
     load_nested: bool = False,
+    randomize_order: bool = False,
 ) -> ComparativeExperimentResults:
     r"""Evaluate existing experiment runs against each other.
 
@@ -453,6 +455,8 @@ def evaluate_comparative(
             Defaults to None.
         load_nested (bool): Whether to load all child runs for the experiment.
             Default is to only load the top-level root runs.
+        randomize_order (bool): Whether to randomize the order of the experiments before evaluation.
+            Default is False.
 
     Returns:
         ComparativeExperimentResults: The results of the comparative evaluation.
@@ -607,7 +611,13 @@ def evaluate_comparative(
         metadata=metadata,
         id=comparative_experiment_id,
     )
-    # TODO: Print out the URL for the experiment.
+    _print_comparative_experiment_start(
+        cast(
+            Tuple[schemas.TracerSessionResult, schemas.TracerSessionResult],
+            tuple(projects),
+        ),
+        comparative_experiment,
+    )
     runs = [
         _load_traces(experiment, client, load_nested=load_nested)
         for experiment in experiments
@@ -649,6 +659,8 @@ def evaluate_and_submit_feedback(
         runs_list: list[schemas.Run], example: schemas.Example, executor: cf.Executor
     ) -> ComparisonEvaluationResult:
         feedback_group_id = uuid.uuid4()
+        if randomize_order:
+            random.shuffle(runs_list)
         result = comparator.compare_runs(runs_list, example)
         if client is None:
             raise ValueError("Client is required to submit feedback.")
@@ -720,6 +732,25 @@ def __getitem__(self, key):
 ## Private API
 
 
+def _print_comparative_experiment_start(
+    experiments: Tuple[schemas.TracerSession, schemas.TracerSession],
+    comparative_experiment: schemas.ComparativeExperiment,
+) -> None:
+    url = experiments[0].url or experiments[1].url
+    if url:
+        project_url = url.split("?")[0]
+        dataset_id = comparative_experiment.reference_dataset_id
+        base_url = project_url.split("/projects/p/")[0]
+        comparison_url = (
+            f"{base_url}/datasets/{dataset_id}/compare?"
+            f"selectedSessions={'%2C'.join([str(e.id) for e in experiments])}"
+            f"&comparativeExperiment={comparative_experiment.id}"
+        )
+        print(  # noqa: T201
+            f"View the pairwise evaluation results at:\n{comparison_url}\n\n"
+        )
+
+
 def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run]]) -> bool:
     return callable(target) or (hasattr(target, "invoke") and callable(target.invoke))
 

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.1.55"
+version = "0.1.56"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <[email protected]>"]
 license = "MIT"