Add Experiment Description Support (#657)
hinthornw authored May 4, 2024
1 parent 73da8f1 commit 6f6e230
Showing 6 changed files with 54 additions and 3 deletions.
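The net effect of this change is a new optional `description` argument that flows from `evaluate` / `aevaluate` (and the JS `EvaluateOptions`) down to the experiment project created on the LangSmith backend. Below is a minimal Python sketch modeled on the docstring examples added in this diff; the target function, evaluator, and dataset name are placeholders, not part of the commit.

    from langsmith.evaluation import evaluate

    # Placeholder target function and evaluator for illustration only;
    # substitute any dataset you own for "my-dataset".
    def predict(inputs: dict) -> dict:
        return {"answer": "Paris"}

    def accuracy(run, example) -> dict:
        score = int(run.outputs["answer"] == example.outputs["answer"])
        return {"key": "accuracy", "score": score}

    results = evaluate(
        predict,
        data="my-dataset",                 # assumes this dataset already exists
        evaluators=[accuracy],
        experiment_prefix="My Experiment",
        # New in this commit: free-form text stored on the experiment.
        description="Evaluating the accuracy of a simple prediction model.",
    )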
6 changes: 6 additions & 0 deletions js/src/evaluation/_runner.ts
@@ -94,6 +94,10 @@ export interface EvaluateOptions {
* @default undefined
*/
experimentPrefix?: string;
/**
* A free-form description of the experiment.
*/
description?: string;
/**
* The maximum number of concurrent evaluations to run.
* @default undefined
@@ -152,6 +156,7 @@ class _ExperimentManager {
_experimentName: string;

_metadata: KVMap;
_description?: string;

get experimentName(): string {
if (this._experimentName) {
@@ -297,6 +302,7 @@ class _ExperimentManager {
projectName: this.experimentName,
referenceDatasetId: firstExample.dataset_id,
metadata: projectMetadata,
description: this._description,
});
} catch (e) {
if (String(e).includes("already exists")) {
16 changes: 15 additions & 1 deletion js/src/tests/evaluate.int.test.ts
@@ -40,7 +40,10 @@ test("evaluate can evaluate", async () => {
};
};

const evalRes = await evaluate(targetFunc, { data: TESTING_DATASET_NAME });
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
description: "Experiment from evaluate can evaluate integration test",
});
// console.log(evalRes.results)
expect(evalRes.results).toHaveLength(2);

@@ -88,6 +91,7 @@ test("evaluate can evaluate with RunEvaluator evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "evaluate can evaluate with RunEvaluator evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -159,6 +163,7 @@ test("evaluate can evaluate with custom evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [customEvaluator],
description: "evaluate can evaluate with custom evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -235,6 +240,7 @@ test("evaluate can evaluate with summary evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
summaryEvaluators: [customSummaryEvaluator],
description: "evaluate can evaluate with summary evaluators",
});

expect(evalRes.summaryResults.results).toHaveLength(1);
@@ -289,6 +295,7 @@ test.skip("can iterate over evaluate results", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "can iterate over evaluate results",
});

for await (const item of evalRes) {
@@ -329,6 +336,7 @@ test("can pass multiple evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: evaluators,
description: "can pass multiple evaluators",
});
expect(evalRes.results).toHaveLength(2);
const firstEvalResults = evalRes.results[0];
@@ -370,6 +378,7 @@ test("can pass multiple summary evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
summaryEvaluators,
description: "can pass multiple summary evaluators",
});
expect(evalRes.results).toHaveLength(2);

@@ -415,6 +424,7 @@ test("can pass AsyncIterable of Example's to evaluator instead of dataset name",
const evalRes = await evaluate(targetFunc, {
data: examplesIterator,
evaluators: [customEvaluator],
description: "can pass AsyncIterable of Example's to evaluator",
});

const firstEvalResults = evalRes.results[0];
@@ -449,6 +459,7 @@ test("max concurrency works with custom evaluators", async () => {
data: TESTING_DATASET_NAME,
evaluators: [customEvaluator],
maxConcurrency: 1,
description: "max concurrency works with custom evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -489,6 +500,7 @@ test("max concurrency works with summary evaluators", async () => {
data: TESTING_DATASET_NAME,
summaryEvaluators: [customSummaryEvaluator],
maxConcurrency: 1,
description: "max concurrency works with summary evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -530,6 +542,7 @@ test("Target func can be a runnable", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "Target func can be a runnable",
});

expect(evalRes.results).toHaveLength(2);
@@ -580,6 +593,7 @@ test("evaluate can accept array of examples", async () => {
const evalRes = await evaluate(targetFunc, {
data: examples,
evaluators: [customEvaluator],
description: "evaluate can accept array of examples",
});

const firstEvalResults = evalRes.results[0];
13 changes: 13 additions & 0 deletions python/langsmith/evaluation/_arunner.py
@@ -59,6 +59,7 @@ async def aevaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -77,6 +78,7 @@ async def aevaluate(
Defaults to None.
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -134,6 +136,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Evaluate the accuracy of the model asynchronously.",
... metadata={
... "my-prompt-version": "abcd-1234",
... },
@@ -154,6 +157,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Subset Experiment",
... description="Evaluate a subset of examples asynchronously.",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -167,6 +171,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Streaming Experiment",
... description="Streaming predictions for debugging.",
... blocking=False,
... )
... ) # doctest: +ELLIPSIS
@@ -186,6 +191,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment Without Concurrency",
... description="This was run without concurrency.",
... max_concurrency=0,
... )
... ) # doctest: +ELLIPSIS
@@ -205,6 +211,7 @@ async def aevaluate(
... evaluators=[helpfulness],
... summary_evaluators=[precision],
... experiment_prefix="My Helpful Experiment",
... description="Applying async evaluators example.",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -216,6 +223,7 @@ async def aevaluate(
summary_evaluators=summary_evaluators,
metadata=metadata,
experiment_prefix=experiment_prefix,
description=description,
max_concurrency=max_concurrency,
client=client,
blocking=blocking,
@@ -333,6 +341,7 @@ async def _aevaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -353,6 +362,7 @@ async def _aevaluate(
client=client,
metadata=metadata,
experiment=experiment_ or experiment_prefix,
description=description,
runs=runs,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
@@ -392,6 +402,7 @@ class _AsyncExperimentManager(_ExperimentManagerMixin):
experiment (Optional[schemas.TracerSession]): The tracer session
associated with the experiment.
experiment_prefix (Optional[str]): The prefix for the experiment name.
description (Optional[str]): The description for the experiment.
metadata (Optional[dict]): Additional metadata for the experiment.
client (Optional[langsmith.Client]): The Langsmith client used for
the experiment.
@@ -411,11 +422,13 @@ def __init__(
client: Optional[langsmith.Client] = None,
evaluation_results: Optional[AsyncIterable[EvaluationResults]] = None,
summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
description: Optional[str] = None,
):
super().__init__(
experiment=experiment,
metadata=metadata,
client=client,
description=description,
)
self._data = data
self._examples: Optional[AsyncIterable[schemas.Example]] = None
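For the async path, the same argument is accepted by `aevaluate` and forwarded through `_aevaluate` to the experiment manager. A small sketch mirroring the docstring examples above; the async target, evaluator, and dataset name are again placeholders.

    import asyncio

    from langsmith.evaluation import aevaluate

    # Placeholder async target and evaluator, mirroring the docstring examples above.
    async def apredict(inputs: dict) -> dict:
        return {"answer": "Paris"}

    def accuracy(run, example) -> dict:
        return {"key": "accuracy", "score": int(run.outputs["answer"] == example.outputs["answer"])}

    async def main() -> None:
        await aevaluate(
            apredict,
            data="my-dataset",                      # assumes this dataset already exists
            evaluators=[accuracy],
            experiment_prefix="My Async Experiment",
            # New in this commit: forwarded to the created experiment project.
            description="Async run exercising the new description field.",
            max_concurrency=2,
        )

    asyncio.run(main())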
16 changes: 16 additions & 0 deletions python/langsmith/evaluation/_runner.py
@@ -77,6 +77,7 @@ def evaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -95,6 +96,7 @@ def evaluate(
Defaults to None.
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A free-form text description for the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -142,6 +144,8 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Evaluating the accuracy of a simple prediction model.",
... metadata={
... "my-prompt-version": "abcd-1234",
... },
@@ -158,6 +162,7 @@ def evaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Just testing a subset synchronously.",
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -168,6 +173,7 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
... description="I don't even have to block!",
... blocking=False,
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -200,6 +206,7 @@ def evaluate(
... prepare_data=prepare_criteria_data,
... ),
... ],
... description="Evaluating with off-the-shelf LangChain evaluators.",
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -217,6 +224,7 @@ def evaluate(
... lc_predict.invoke,
... data=dataset_name,
... evaluators=[accuracy],
... description="This time we're evaluating a LangChain object.",
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -228,6 +236,7 @@ def evaluate(
summary_evaluators=summary_evaluators,
metadata=metadata,
experiment_prefix=experiment_prefix,
description=description,
max_concurrency=max_concurrency,
client=client,
blocking=blocking,
@@ -413,6 +422,7 @@ def _evaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -432,6 +442,7 @@ def _evaluate(
client=client,
metadata=metadata,
experiment=experiment_ or experiment_prefix,
description=description,
# If provided, we don't need to create a new experiment.
runs=runs,
# Create or resolve the experiment.
@@ -525,6 +536,7 @@ def __init__(
experiment: Optional[Union[schemas.TracerSession, str]],
metadata: Optional[dict] = None,
client: Optional[langsmith.Client] = None,
description: Optional[str] = None,
):
self.client = client or langsmith.Client()
self._experiment: Optional[schemas.TracerSession] = None
@@ -545,6 +557,7 @@ def __init__(
**metadata,
}
self._metadata = metadata or {}
self._description = description

@property
def experiment_name(self) -> str:
@@ -580,6 +593,7 @@ def _get_project(self, first_example: schemas.Example) -> schemas.TracerSession:
project_metadata = self._get_experiment_metadata()
project = self.client.create_project(
self.experiment_name,
description=self._description,
reference_dataset_id=first_example.dataset_id,
metadata=project_metadata,
)
@@ -649,11 +663,13 @@ def __init__(
runs: Optional[Iterable[schemas.Run]] = None,
evaluation_results: Optional[Iterable[EvaluationResults]] = None,
summary_results: Optional[Iterable[EvaluationResults]] = None,
description: Optional[str] = None,
):
super().__init__(
experiment=experiment,
metadata=metadata,
client=client,
description=description,
)
self._data = data
self._examples: Optional[Iterable[schemas.Example]] = None
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.53"
version = "0.1.54"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
4 changes: 3 additions & 1 deletion python/tests/evaluation/test_evaluation.py
@@ -34,6 +34,7 @@ def predict(inputs: dict) -> dict:
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
description="My sync experiment",
metadata={
"my-prompt-version": "abcd-1234",
"function": "evaluate",
@@ -70,9 +71,10 @@ async def apredict(inputs: dict) -> dict:
evaluators=[accuracy],
summary_evaluators=[precision],
experiment_prefix="My Experiment",
description="My Experiment Description",
metadata={
"my-prompt-version": "abcd-1234",
"function": "agevaluate",
"function": "aevaluate",
},
)

