From 6f6e23096a5b199beee72162f4dab6acaf563766 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Fri, 3 May 2024 17:15:03 -0700
Subject: [PATCH] Add Experiment Description Support (#657)

---
 js/src/evaluation/_runner.ts               |  6 ++++++
 js/src/tests/evaluate.int.test.ts          | 16 +++++++++++++++-
 python/langsmith/evaluation/_arunner.py    | 13 +++++++++++++
 python/langsmith/evaluation/_runner.py     | 16 ++++++++++++++++
 python/pyproject.toml                      |  2 +-
 python/tests/evaluation/test_evaluation.py |  4 +++-
 6 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts
index 533b5e468..ef5dbe93b 100644
--- a/js/src/evaluation/_runner.ts
+++ b/js/src/evaluation/_runner.ts
@@ -94,6 +94,10 @@ export interface EvaluateOptions {
    * @default undefined
    */
   experimentPrefix?: string;
+  /**
+   * A free-form description of the experiment.
+   */
+  description?: string;
   /**
    * The maximum number of concurrent evaluations to run.
    * @default undefined
    */
@@ -152,6 +156,7 @@ class _ExperimentManager {
 
   _experimentName: string;
   _metadata: KVMap;
+  _description?: string;
 
   get experimentName(): string {
     if (this._experimentName) {
@@ -297,6 +302,7 @@ class _ExperimentManager {
         projectName: this.experimentName,
         referenceDatasetId: firstExample.dataset_id,
         metadata: projectMetadata,
+        description: this._description,
       });
     } catch (e) {
       if (String(e).includes("already exists")) {
diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts
index e445348b2..e0b2891d0 100644
--- a/js/src/tests/evaluate.int.test.ts
+++ b/js/src/tests/evaluate.int.test.ts
@@ -40,7 +40,10 @@ test("evaluate can evaluate", async () => {
     };
   };
 
-  const evalRes = await evaluate(targetFunc, { data: TESTING_DATASET_NAME });
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    description: "Experiment from evaluate can evaluate integration test",
+  });
   // console.log(evalRes.results)
   expect(evalRes.results).toHaveLength(2);
 
@@ -88,6 +91,7 @@ test("evaluate can evaluate with RunEvaluator evaluators", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     evaluators: [evaluator],
+    description: "evaluate can evaluate with RunEvaluator evaluators",
   });
 
   expect(evalRes.results).toHaveLength(2);
@@ -159,6 +163,7 @@ test("evaluate can evaluate with custom evaluators", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     evaluators: [customEvaluator],
+    description: "evaluate can evaluate with custom evaluators",
   });
 
   expect(evalRes.results).toHaveLength(2);
@@ -235,6 +240,7 @@ test("evaluate can evaluate with summary evaluators", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     summaryEvaluators: [customSummaryEvaluator],
+    description: "evaluate can evaluate with summary evaluators",
   });
 
   expect(evalRes.summaryResults.results).toHaveLength(1);
@@ -289,6 +295,7 @@ test.skip("can iterate over evaluate results", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     evaluators: [evaluator],
+    description: "can iterate over evaluate results",
   });
 
   for await (const item of evalRes) {
@@ -329,6 +336,7 @@ test("can pass multiple evaluators", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     evaluators: evaluators,
+    description: "can pass multiple evaluators",
   });
   expect(evalRes.results).toHaveLength(2);
   const firstEvalResults = evalRes.results[0];
@@ -370,6 +378,7 @@ test("can pass multiple summary evaluators", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     summaryEvaluators,
+    description: "can pass multiple summary evaluators",
   });
 
   expect(evalRes.results).toHaveLength(2);
@@ -415,6 +424,7 @@ test("can pass AsyncIterable of Example's to evaluator instead of dataset name",
   const evalRes = await evaluate(targetFunc, {
     data: examplesIterator,
     evaluators: [customEvaluator],
+    description: "can pass AsyncIterable of Example's to evaluator",
   });
 
   const firstEvalResults = evalRes.results[0];
@@ -449,6 +459,7 @@ test("max concurrency works with custom evaluators", async () => {
     data: TESTING_DATASET_NAME,
     evaluators: [customEvaluator],
     maxConcurrency: 1,
+    description: "max concurrency works with custom evaluators",
  });
 
   expect(evalRes.results).toHaveLength(2);
@@ -489,6 +500,7 @@ test("max concurrency works with summary evaluators", async () => {
     data: TESTING_DATASET_NAME,
     summaryEvaluators: [customSummaryEvaluator],
     maxConcurrency: 1,
+    description: "max concurrency works with summary evaluators",
   });
 
   expect(evalRes.results).toHaveLength(2);
@@ -530,6 +542,7 @@ test("Target func can be a runnable", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: TESTING_DATASET_NAME,
     evaluators: [evaluator],
+    description: "Target func can be a runnable",
   });
 
   expect(evalRes.results).toHaveLength(2);
@@ -580,6 +593,7 @@ test("evaluate can accept array of examples", async () => {
   const evalRes = await evaluate(targetFunc, {
     data: examples,
     evaluators: [customEvaluator],
+    description: "evaluate can accept array of examples",
   });
 
   const firstEvalResults = evalRes.results[0];
diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py
index 86073349a..eb344fdbe 100644
--- a/python/langsmith/evaluation/_arunner.py
+++ b/python/langsmith/evaluation/_arunner.py
@@ -59,6 +59,7 @@ async def aevaluate(
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
+    description: Optional[str] = None,
     max_concurrency: Optional[int] = None,
     client: Optional[langsmith.Client] = None,
     blocking: bool = True,
@@ -77,6 +78,7 @@ async def aevaluate(
             Defaults to None.
         experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
             Defaults to None.
+        description (Optional[str]): A description of the experiment.
         max_concurrency (Optional[int]): The maximum number of concurrent
             evaluations to run. Defaults to None.
         client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -134,6 +136,7 @@ async def aevaluate(
         ...         evaluators=[accuracy],
         ...         summary_evaluators=[precision],
         ...         experiment_prefix="My Experiment",
+        ...         description="Evaluate the accuracy of the model asynchronously.",
         ...         metadata={
         ...             "my-prompt-version": "abcd-1234",
         ...         },
@@ -154,6 +157,7 @@ async def aevaluate(
         ...         evaluators=[accuracy],
         ...         summary_evaluators=[precision],
         ...         experiment_prefix="My Subset Experiment",
+        ...         description="Evaluate a subset of examples asynchronously.",
         ...     )
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
@@ -167,6 +171,7 @@ async def aevaluate(
         ...         evaluators=[accuracy],
         ...         summary_evaluators=[precision],
         ...         experiment_prefix="My Streaming Experiment",
+        ...         description="Streaming predictions for debugging.",
         ...         blocking=False,
         ...     )
         ... )  # doctest: +ELLIPSIS
@@ -186,6 +191,7 @@ async def aevaluate(
         ...         evaluators=[accuracy],
         ...         summary_evaluators=[precision],
         ...         experiment_prefix="My Experiment Without Concurrency",
+        ...         description="This was run without concurrency.",
         ...         max_concurrency=0,
         ...     )
         ... )  # doctest: +ELLIPSIS
@@ -205,6 +211,7 @@ async def aevaluate(
         ...         evaluators=[helpfulness],
         ...         summary_evaluators=[precision],
         ...         experiment_prefix="My Helpful Experiment",
+        ...         description="Applying async evaluators example.",
         ...     )
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
@@ -216,6 +223,7 @@ async def aevaluate(
         summary_evaluators=summary_evaluators,
         metadata=metadata,
         experiment_prefix=experiment_prefix,
+        description=description,
         max_concurrency=max_concurrency,
         client=client,
         blocking=blocking,
@@ -333,6 +341,7 @@ async def _aevaluate(
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
+    description: Optional[str] = None,
     max_concurrency: Optional[int] = None,
     client: Optional[langsmith.Client] = None,
     blocking: bool = True,
@@ -353,6 +362,7 @@ async def _aevaluate(
         client=client,
         metadata=metadata,
         experiment=experiment_ or experiment_prefix,
+        description=description,
         runs=runs,
     ).astart()
     cache_dir = ls_utils.get_cache_dir(None)
@@ -392,6 +402,7 @@ class _AsyncExperimentManager(_ExperimentManagerMixin):
         experiment (Optional[schemas.TracerSession]): The tracer session
             associated with the experiment.
         experiment_prefix (Optional[str]): The prefix for the experiment name.
+        description (Optional[str]): The description for the experiment.
         metadata (Optional[dict]): Additional metadata for the experiment.
         client (Optional[langsmith.Client]): The Langsmith client used for
             the experiment.
@@ -411,11 +422,13 @@ def __init__(
         client: Optional[langsmith.Client] = None,
         evaluation_results: Optional[AsyncIterable[EvaluationResults]] = None,
         summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
+        description: Optional[str] = None,
     ):
         super().__init__(
             experiment=experiment,
             metadata=metadata,
             client=client,
+            description=description,
         )
         self._data = data
         self._examples: Optional[AsyncIterable[schemas.Example]] = None
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index d205734e3..6c00ed15f 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -77,6 +77,7 @@ def evaluate(
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
+    description: Optional[str] = None,
     max_concurrency: Optional[int] = None,
     client: Optional[langsmith.Client] = None,
     blocking: bool = True,
@@ -95,6 +96,7 @@ def evaluate(
             Defaults to None.
         experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
             Defaults to None.
+        description (Optional[str]): A free-form text description for the experiment.
         max_concurrency (Optional[int]): The maximum number of concurrent
             evaluations to run. Defaults to None.
         client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -142,6 +144,8 @@ def evaluate(
         ...     data=dataset_name,
         ...     evaluators=[accuracy],
         ...     summary_evaluators=[precision],
+        ...     experiment_prefix="My Experiment",
+        ...     description="Evaluating the accuracy of a simple prediction model.",
         ...     metadata={
         ...         "my-prompt-version": "abcd-1234",
         ...     },
@@ -158,6 +162,7 @@ def evaluate(
         ...     evaluators=[accuracy],
         ...     summary_evaluators=[precision],
         ...     experiment_prefix="My Experiment",
+        ...     description="Just testing a subset synchronously.",
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
 
@@ -168,6 +173,7 @@ def evaluate(
         ...     data=dataset_name,
         ...     evaluators=[accuracy],
         ...     summary_evaluators=[precision],
+        ...     description="I don't even have to block!",
         ...     blocking=False,
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
@@ -200,6 +206,7 @@ def evaluate(
         ...             prepare_data=prepare_criteria_data,
         ...         ),
         ...     ],
+        ...     description="Evaluating with off-the-shelf LangChain evaluators.",
         ...     summary_evaluators=[precision],
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
@@ -217,6 +224,7 @@ def evaluate(
         ...     lc_predict.invoke,
         ...     data=dataset_name,
         ...     evaluators=[accuracy],
+        ...     description="This time we're evaluating a LangChain object.",
         ...     summary_evaluators=[precision],
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
@@ -228,6 +236,7 @@ def evaluate(
         summary_evaluators=summary_evaluators,
         metadata=metadata,
         experiment_prefix=experiment_prefix,
+        description=description,
         max_concurrency=max_concurrency,
         client=client,
         blocking=blocking,
@@ -413,6 +422,7 @@ def _evaluate(
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
+    description: Optional[str] = None,
     max_concurrency: Optional[int] = None,
     client: Optional[langsmith.Client] = None,
     blocking: bool = True,
@@ -432,6 +442,7 @@ def _evaluate(
         client=client,
         metadata=metadata,
         experiment=experiment_ or experiment_prefix,
+        description=description,
         # If provided, we don't need to create a new experiment.
         runs=runs,
         # Create or resolve the experiment.
@@ -525,6 +536,7 @@ def __init__(
         experiment: Optional[Union[schemas.TracerSession, str]],
         metadata: Optional[dict] = None,
         client: Optional[langsmith.Client] = None,
+        description: Optional[str] = None,
     ):
         self.client = client or langsmith.Client()
         self._experiment: Optional[schemas.TracerSession] = None
@@ -545,6 +557,7 @@ def __init__(
                 **metadata,
             }
         self._metadata = metadata or {}
+        self._description = description
 
     @property
     def experiment_name(self) -> str:
@@ -580,6 +593,7 @@ def _get_project(self, first_example: schemas.Example) -> schemas.TracerSession:
         project_metadata = self._get_experiment_metadata()
         project = self.client.create_project(
             self.experiment_name,
+            description=self._description,
             reference_dataset_id=first_example.dataset_id,
             metadata=project_metadata,
         )
@@ -649,11 +663,13 @@ def __init__(
         runs: Optional[Iterable[schemas.Run]] = None,
         evaluation_results: Optional[Iterable[EvaluationResults]] = None,
         summary_results: Optional[Iterable[EvaluationResults]] = None,
+        description: Optional[str] = None,
     ):
         super().__init__(
             experiment=experiment,
             metadata=metadata,
             client=client,
+            description=description,
         )
         self._data = data
         self._examples: Optional[Iterable[schemas.Example]] = None
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6ed7a77b6..5899d06cf 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.1.53"
+version = "0.1.54"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain "]
 license = "MIT"
diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py
index ffb3076a8..234aeb6f0 100644
--- a/python/tests/evaluation/test_evaluation.py
+++ b/python/tests/evaluation/test_evaluation.py
@@ -34,6 +34,7 @@ def predict(inputs: dict) -> dict:
         data=dataset_name,
         evaluators=[accuracy],
         summary_evaluators=[precision],
+        description="My sync experiment",
         metadata={
             "my-prompt-version": "abcd-1234",
             "function": "evaluate",
@@ -70,9 +71,10 @@ async def apredict(inputs: dict) -> dict:
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment_prefix="My Experiment",
+        description="My Experiment Description",
         metadata={
             "my-prompt-version": "abcd-1234",
-            "function": "agevaluate",
+            "function": "aevaluate",
         },
     )
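For reference, a minimal usage sketch of the new description argument from the Python side, mirroring the docstring examples above; the dataset name, target function, and evaluator below are hypothetical stand-ins and are not part of this patch:

    from langsmith.evaluation import evaluate

    # Hypothetical target function: echoes the question back as the answer.
    def predict(inputs: dict) -> dict:
        return {"answer": inputs["question"]}

    # Hypothetical evaluator: exact match against the reference output.
    def exact_match(run, example) -> dict:
        return {
            "key": "exact_match",
            "score": int(run.outputs["answer"] == example.outputs["answer"]),
        }

    results = evaluate(
        predict,
        data="my-dataset",  # assumed to already exist in LangSmith
        evaluators=[exact_match],
        experiment_prefix="Description Demo",
        # Added by this patch: free-form text stored on the experiment's project.
        description="Baseline run to sanity-check the echo target.",
    )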