Commit 843005f

Merge branch 'main' into dqbd/entrypoints-cjs

hinthornw authored Mar 27, 2024
2 parents 13c5dbb + 25808cf commit 843005f
Showing 5 changed files with 77 additions and 39 deletions.
js/src/schemas.ts (4 changes: 2 additions & 2 deletions)
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
last_run_start_time?: number;
// Feedback stats for the session.
feedback_stats?: Record<string, unknown>;
-  // The reference dataset IDs this session's runs were generated on.
-  reference_dataset_ids?: string[];
+  // The reference dataset ID this session's runs were generated on.
+  reference_dataset_id?: string;
// Facets for the runs in the session.
run_facets?: KVMap[];
}
python/langsmith/evaluation/_runner.py (104 changes: 70 additions & 34 deletions)
@@ -77,23 +77,23 @@ def evaluate(
r"""Evaluate a target system or function on a given dataset.
Args:
-    target (TARGET_T): The target system or function to evaluate.
-    data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
-        examples, or a generator of examples.
-    evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
-        on each example. Defaults to None.
-    summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
-        evaluators to run on the entire dataset. Defaults to None.
-    metadata (Optional[dict]): Metadata to attach to the experiment.
-        Defaults to None.
-    experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
-        Defaults to None.
-    max_concurrency (Optional[int]): The maximum number of concurrent
-        evaluations to run. Defaults to None.
-    client (Optional[langsmith.Client]): The LangSmith client to use.
-        Defaults to None.
-    blocking (bool): Whether to block until the evaluation is complete.
-        Defaults to True.
+        target (TARGET_T): The target system or function to evaluate.
+        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+            examples, or a generator of examples.
+        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+            on each example. Defaults to None.
+        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+            evaluators to run on the entire dataset. Defaults to None.
+        metadata (Optional[dict]): Metadata to attach to the experiment.
+            Defaults to None.
+        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+            Defaults to None.
+        max_concurrency (Optional[int]): The maximum number of concurrent
+            evaluations to run. Defaults to None.
+        client (Optional[langsmith.Client]): The LangSmith client to use.
+            Defaults to None.
+        blocking (bool): Whether to block until the evaluation is complete.
+            Defaults to True.
Returns:
ExperimentResults: The results of the evaluation.
@@ -138,6 +138,9 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
+    ...     metadata={
+    ...         "my-prompt-version": "abcd-1234",
+    ...     },
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
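Note: the added doctest lines attach experiment-level metadata at call time. As a quick, hedged illustration (assuming a client and that the experiment name matches the doctest above; read_project and project.metadata are used this way elsewhere in this diff), those keys can be read back from the created project afterwards:

    project = client.read_project(project_name="My Experiment:64e6e91")
    print(project.metadata.get("my-prompt-version"))  # expected: "abcd-1234"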
@@ -234,7 +237,6 @@ def evaluate(
def evaluate_existing(
experiment: Union[str, uuid.UUID],
/,
-    data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
@@ -294,13 +296,21 @@ def evaluate_existing(
>>> experiment_name = "My Experiment:64e6e91" # Or manually specify
>>> results = evaluate_existing(
... experiment_name,
-    ...     data=dataset_name,
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
client = client or langsmith.Client()
+    project = _load_experiment(experiment, client)
runs = _load_traces(experiment, client, load_nested=load_nested)
+    data = list(
+        client.list_examples(
+            dataset_id=project.reference_dataset_id,
+            as_of=project.metadata.get("dataset_version"),
+        )
+    )
+    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
+    data = sorted(data, key=lambda d: str(d.id))
return _evaluate(
runs,
data=data,
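Note: with the data parameter removed, evaluate_existing now resolves examples itself from the experiment's reference_dataset_id (pinned to the recorded dataset_version) and aligns runs to examples by sorting both on the shared example ID. A minimal sketch of the updated call, mirroring the doctest above:

    # No dataset argument anymore; `precision` is the summary evaluator
    # defined in the doctest.
    results = evaluate_existing(
        "My Experiment:64e6e91",
        summary_evaluators=[precision],
    )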
@@ -400,13 +410,15 @@ def _evaluate(
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
+    experiment: Optional[schemas.TracerSession] = None,
) -> ExperimentResults:
# Initialize the experiment manager.
manager = _ExperimentManager(
data,
client=client,
metadata=metadata,
experiment_prefix=experiment_prefix,
+        experiment=experiment,
# If provided, we don't need to create a new experiment.
runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target),
# Create or resolve the experiment.
@@ -438,17 +450,25 @@ def _is_uuid(value: str) -> bool:
return False


+def _load_experiment(
+    project: Union[str, uuid.UUID], client: langsmith.Client
+) -> schemas.TracerSession:
+    if isinstance(project, uuid.UUID) or _is_uuid(project):
+        return client.read_project(project_id=project)
+    return client.read_project(project_name=project)
+

def _load_traces(
project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
-) -> Iterable[schemas.Run]:
+) -> List[schemas.Run]:
"""Load nested traces for a given project."""
execution_order = None if load_nested else 1
if isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, execution_order=execution_order)
else:
runs = client.list_runs(project_name=project, execution_order=execution_order)
-    if not load_nested:
-        return runs
+    return list(runs)

treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list)
results = []
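Note: _load_traces now returns a concrete List[schemas.Run] rather than a lazy iterable, so callers such as evaluate_existing can sort and re-iterate the runs without exhausting a one-shot generator. A small sketch under that assumption, using the module's own private helper:

    runs = _load_traces("My Experiment:64e6e91", client)  # now a list
    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
    print(len(runs))  # re-iteration is safe; sorted() consumed nothing permanently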
@@ -560,13 +580,7 @@ def start(self) -> _ExperimentManager:
_examples = itertools.chain([first_example], self.examples)
if self._experiment is None:
try:
-                project_metadata = self._metadata or {}
-                git_info = ls_env.get_git_info()
-                if git_info:
-                    project_metadata = {
-                        **project_metadata,
-                        "git": git_info,
-                    }
+                project_metadata = self._get_experiment_metadata()
project = self.client.create_project(
self.experiment_name,
reference_dataset_id=first_example.dataset_id,
@@ -702,6 +716,21 @@ def get_summary_scores(self) -> Dict[str, List[dict]]:

# Private methods.

+    def _get_experiment_metadata(self):
+        project_metadata = self._metadata or {}
+        git_info = ls_env.get_git_info()
+        if git_info:
+            project_metadata = {
+                **project_metadata,
+                "git": git_info,
+            }
+        if self._experiment:
+            project_metadata = {
+                **self._experiment.metadata,
+                **project_metadata,
+            }
+        return project_metadata
+
def _get_experiment(self) -> schemas.TracerSession:
if self._experiment is None:
raise ValueError("Experiment not started yet.")
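Note: the merge order in _get_experiment_metadata means git info is attached under a "git" key when available, and caller-supplied metadata wins over whatever the existing experiment already stored. A toy illustration with hypothetical values (plain dicts, runnable standalone):

    stored = {"dataset_version": "2024-03-01T00:00:00+00:00"}  # self._experiment.metadata
    supplied = {"my-prompt-version": "abcd-1234"}              # self._metadata
    git_info = {"commit": "25808cf"}                           # ls_env.get_git_info()
    merged = {**stored, **supplied, "git": git_info}           # later keys win on collision
    print(merged)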
@@ -716,15 +745,15 @@ def _end(self) -> None:
# Should always be defined in practice when fetched,
# but the typing permits None
max_modified_at = max(modified_at) if modified_at else None
+        project_metadata = self._get_experiment_metadata()
+        project_metadata["dataset_version"] = (
+            max_modified_at.isoformat() if max_modified_at else None
+        )

self.client.update_project(
experiment.id,
end_time=datetime.datetime.now(datetime.timezone.utc),
-            metadata={
-                "dataset_version": (
-                    max_modified_at.isoformat() if max_modified_at else None
-                )
-            },
+            metadata=project_metadata,
)

def _predict(
@@ -926,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None:
reference_example_id=example.id,
on_end=_get_run,
project_name=experiment_name,
-            metadata=metadata,
+            metadata={
+                **metadata,
+                "example_version": (
+                    example.modified_at.isoformat()
+                    if example.modified_at
+                    else example.created_at.isoformat()
+                ),
+            },
client=client,
),
)
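Note: the final hunk of this file stamps each target run's metadata with an example_version, falling back from the example's modified_at to its created_at when the example was never edited. A self-contained sketch of that fallback; the Example stub here is hypothetical:

    import datetime

    class Example:  # hypothetical stand-in for schemas.Example
        modified_at = None
        created_at = datetime.datetime(2024, 3, 27, tzinfo=datetime.timezone.utc)

    # Equivalent to the conditional in the diff when modified_at is None.
    version = (Example.modified_at or Example.created_at).isoformat()
    print(version)  # created_at is used because modified_at was never set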
python/langsmith/evaluation/evaluator.py (4 changes: 3 additions & 1 deletion)
@@ -161,7 +161,9 @@ def evaluate_run(
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
""" # noqa: E501
source_run_id = uuid.uuid4()
metadata = {"target_run_id": run.id, "experiment": run.session_id}
metadata = {"target_run_id": run.id}
if getattr(run, "session_id"):
metadata["experiment"] = run.session_id
result = self.func(
run,
example,
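Note: the guard above only attaches the "experiment" key when the run carries a truthy session_id. Two-argument getattr(run, "session_id") still raises AttributeError if the attribute is missing entirely, so this protects against None or empty values rather than absent attributes. A minimal illustration with a stub run:

    class Run:  # hypothetical stand-in for schemas.Run
        id = "run-123"
        session_id = None  # e.g. a run not yet tied to an experiment

    run = Run()
    metadata = {"target_run_id": run.id}
    if getattr(run, "session_id"):
        metadata["experiment"] = run.session_id
    print(metadata)  # {'target_run_id': 'run-123'}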
python/langsmith/schemas.py (2 changes: 1 addition & 1 deletion)
@@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession):
"""The start time of the last run in the project."""
feedback_stats: Optional[Dict[str, Any]]
"""Feedback stats for the project."""
-    reference_dataset_ids: Optional[List[UUID]]
+    reference_dataset_id: Optional[UUID]
"""The reference dataset IDs this project's runs were generated on."""
run_facets: Optional[List[Dict[str, Any]]]
"""Facets for the runs in the project."""
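Note: with the field now singular, resolving an experiment's dataset is a direct lookup, which is exactly how the new evaluate_existing path uses it. A sketch assuming a client and the experiment name from the examples above:

    project = client.read_project(project_name="My Experiment:64e6e91")
    examples = list(client.list_examples(dataset_id=project.reference_dataset_id))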
python/pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.33"
version = "0.1.34"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
