Add Experiment Description Support (#657)
hinthornw authored May 4, 2024
1 parent 73da8f1 commit 6f6e230
Showing 6 changed files with 54 additions and 3 deletions.
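The net effect of this change is a new optional `description` argument that flows from `evaluate` / `aevaluate` (and the JS `EvaluateOptions`) down to the experiment project created on the LangSmith backend. Below is a minimal Python sketch modeled on the docstring examples added in this diff; the target function, evaluator, and dataset name are placeholders, not part of the commit.

    from langsmith.evaluation import evaluate

    # Placeholder target function and evaluator for illustration only;
    # substitute any dataset you own for "my-dataset".
    def predict(inputs: dict) -> dict:
        return {"answer": "Paris"}

    def accuracy(run, example) -> dict:
        score = int(run.outputs["answer"] == example.outputs["answer"])
        return {"key": "accuracy", "score": score}

    results = evaluate(
        predict,
        data="my-dataset",                 # assumes this dataset already exists
        evaluators=[accuracy],
        experiment_prefix="My Experiment",
        # New in this commit: free-form text stored on the experiment.
        description="Evaluating the accuracy of a simple prediction model.",
    )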
6 changes: 6 additions & 0 deletions js/src/evaluation/_runner.ts
@@ -94,6 +94,10 @@ export interface EvaluateOptions {
* @default undefined
*/
experimentPrefix?: string;
/**
* A free-form description of the experiment.
*/
description?: string;
/**
* The maximum number of concurrent evaluations to run.
* @default undefined
@@ -152,6 +156,7 @@ class _ExperimentManager {
_experimentName: string;

_metadata: KVMap;
_description?: string;

get experimentName(): string {
if (this._experimentName) {
@@ -297,6 +302,7 @@ class _ExperimentManager {
projectName: this.experimentName,
referenceDatasetId: firstExample.dataset_id,
metadata: projectMetadata,
description: this._description,
});
} catch (e) {
if (String(e).includes("already exists")) {
16 changes: 15 additions & 1 deletion js/src/tests/evaluate.int.test.ts
@@ -40,7 +40,10 @@ test("evaluate can evaluate", async () => {
};
};

const evalRes = await evaluate(targetFunc, { data: TESTING_DATASET_NAME });
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
description: "Experiment from evaluate can evaluate integration test",
});
// console.log(evalRes.results)
expect(evalRes.results).toHaveLength(2);

@@ -88,6 +91,7 @@ test("evaluate can evaluate with RunEvaluator evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "evaluate can evaluate with RunEvaluator evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -159,6 +163,7 @@ test("evaluate can evaluate with custom evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [customEvaluator],
description: "evaluate can evaluate with custom evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -235,6 +240,7 @@ test("evaluate can evaluate with summary evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
summaryEvaluators: [customSummaryEvaluator],
description: "evaluate can evaluate with summary evaluators",
});

expect(evalRes.summaryResults.results).toHaveLength(1);
@@ -289,6 +295,7 @@ test.skip("can iterate over evaluate results", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "can iterate over evaluate results",
});

for await (const item of evalRes) {
@@ -329,6 +336,7 @@ test("can pass multiple evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: evaluators,
description: "can pass multiple evaluators",
});
expect(evalRes.results).toHaveLength(2);
const firstEvalResults = evalRes.results[0];
@@ -370,6 +378,7 @@ test("can pass multiple summary evaluators", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
summaryEvaluators,
description: "can pass multiple summary evaluators",
});
expect(evalRes.results).toHaveLength(2);

@@ -415,6 +424,7 @@ test("can pass AsyncIterable of Example's to evaluator instead of dataset name",
const evalRes = await evaluate(targetFunc, {
data: examplesIterator,
evaluators: [customEvaluator],
description: "can pass AsyncIterable of Example's to evaluator",
});

const firstEvalResults = evalRes.results[0];
@@ -449,6 +459,7 @@ test("max concurrency works with custom evaluators", async () => {
data: TESTING_DATASET_NAME,
evaluators: [customEvaluator],
maxConcurrency: 1,
description: "max concurrency works with custom evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -489,6 +500,7 @@ test("max concurrency works with summary evaluators", async () => {
data: TESTING_DATASET_NAME,
summaryEvaluators: [customSummaryEvaluator],
maxConcurrency: 1,
description: "max concurrency works with summary evaluators",
});

expect(evalRes.results).toHaveLength(2);
@@ -530,6 +542,7 @@ test("Target func can be a runnable", async () => {
const evalRes = await evaluate(targetFunc, {
data: TESTING_DATASET_NAME,
evaluators: [evaluator],
description: "Target func can be a runnable",
});

expect(evalRes.results).toHaveLength(2);
@@ -580,6 +593,7 @@ test("evaluate can accept array of examples", async () => {
const evalRes = await evaluate(targetFunc, {
data: examples,
evaluators: [customEvaluator],
description: "evaluate can accept array of examples",
});

const firstEvalResults = evalRes.results[0];
13 changes: 13 additions & 0 deletions python/langsmith/evaluation/_arunner.py
@@ -59,6 +59,7 @@ async def aevaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -77,6 +78,7 @@ async def aevaluate(
Defaults to None.
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A description of the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -134,6 +136,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Evaluate the accuracy of the model asynchronously.",
... metadata={
... "my-prompt-version": "abcd-1234",
... },
@@ -154,6 +157,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Subset Experiment",
... description="Evaluate a subset of examples asynchronously.",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -167,6 +171,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Streaming Experiment",
... description="Streaming predictions for debugging.",
... blocking=False,
... )
... ) # doctest: +ELLIPSIS
@@ -186,6 +191,7 @@ async def aevaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment Without Concurrency",
... description="This was run without concurrency.",
... max_concurrency=0,
... )
... ) # doctest: +ELLIPSIS
@@ -205,6 +211,7 @@ async def aevaluate(
... evaluators=[helpfulness],
... summary_evaluators=[precision],
... experiment_prefix="My Helpful Experiment",
... description="Applying async evaluators example.",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -216,6 +223,7 @@ async def aevaluate(
summary_evaluators=summary_evaluators,
metadata=metadata,
experiment_prefix=experiment_prefix,
description=description,
max_concurrency=max_concurrency,
client=client,
blocking=blocking,
@@ -333,6 +341,7 @@ async def _aevaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -353,6 +362,7 @@ async def _aevaluate(
client=client,
metadata=metadata,
experiment=experiment_ or experiment_prefix,
description=description,
runs=runs,
).astart()
cache_dir = ls_utils.get_cache_dir(None)
@@ -392,6 +402,7 @@ class _AsyncExperimentManager(_ExperimentManagerMixin):
experiment (Optional[schemas.TracerSession]): The tracer session
associated with the experiment.
experiment_prefix (Optional[str]): The prefix for the experiment name.
description (Optional[str]): The description for the experiment.
metadata (Optional[dict]): Additional metadata for the experiment.
client (Optional[langsmith.Client]): The Langsmith client used for
the experiment.
@@ -411,11 +422,13 @@ def __init__(
client: Optional[langsmith.Client] = None,
evaluation_results: Optional[AsyncIterable[EvaluationResults]] = None,
summary_results: Optional[AsyncIterable[EvaluationResults]] = None,
description: Optional[str] = None,
):
super().__init__(
experiment=experiment,
metadata=metadata,
client=client,
description=description,
)
self._data = data
self._examples: Optional[AsyncIterable[schemas.Example]] = None
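For the async path, the same argument is accepted by `aevaluate` and forwarded through `_aevaluate` to the experiment manager. A small sketch mirroring the docstring examples above; the async target, evaluator, and dataset name are again placeholders.

    import asyncio

    from langsmith.evaluation import aevaluate

    # Placeholder async target and evaluator, mirroring the docstring examples above.
    async def apredict(inputs: dict) -> dict:
        return {"answer": "Paris"}

    def accuracy(run, example) -> dict:
        return {"key": "accuracy", "score": int(run.outputs["answer"] == example.outputs["answer"])}

    async def main() -> None:
        await aevaluate(
            apredict,
            data="my-dataset",                      # assumes this dataset already exists
            evaluators=[accuracy],
            experiment_prefix="My Async Experiment",
            # New in this commit: forwarded to the created experiment project.
            description="Async run exercising the new description field.",
            max_concurrency=2,
        )

    asyncio.run(main())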
16 changes: 16 additions & 0 deletions python/langsmith/evaluation/_runner.py
@@ -77,6 +77,7 @@ def evaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -95,6 +96,7 @@ def evaluate(
Defaults to None.
experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
Defaults to None.
description (Optional[str]): A free-form text description for the experiment.
max_concurrency (Optional[int]): The maximum number of concurrent
evaluations to run. Defaults to None.
client (Optional[langsmith.Client]): The LangSmith client to use.
@@ -142,6 +144,8 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Evaluating the accuracy of a simple prediction model.",
... metadata={
... "my-prompt-version": "abcd-1234",
... },
@@ -158,6 +162,7 @@ def evaluate(
... evaluators=[accuracy],
... summary_evaluators=[precision],
... experiment_prefix="My Experiment",
... description="Just testing a subset synchronously.",
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -168,6 +173,7 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
... description="I don't even have to block!",
... blocking=False,
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -200,6 +206,7 @@ def evaluate(
... prepare_data=prepare_criteria_data,
... ),
... ],
... description="Evaluating with off-the-shelf LangChain evaluators.",
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -217,6 +224,7 @@ def evaluate(
... lc_predict.invoke,
... data=dataset_name,
... evaluators=[accuracy],
... description="This time we're evaluating a LangChain object.",
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
@@ -228,6 +236,7 @@ def evaluate(
summary_evaluators=summary_evaluators,
metadata=metadata,
experiment_prefix=experiment_prefix,
description=description,
max_concurrency=max_concurrency,
client=client,
blocking=blocking,
@@ -413,6 +422,7 @@ def _evaluate(
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
@@ -432,6 +442,7 @@ def _evaluate(
client=client,
metadata=metadata,
experiment=experiment_ or experiment_prefix,
description=description,
# If provided, we don't need to create a new experiment.
runs=runs,
# Create or resolve the experiment.
@@ -525,6 +536,7 @@ def __init__(
experiment: Optional[Union[schemas.TracerSession, str]],
metadata: Optional[dict] = None,
client: Optional[langsmith.Client] = None,
description: Optional[str] = None,
):
self.client = client or langsmith.Client()
self._experiment: Optional[schemas.TracerSession] = None
@@ -545,6 +557,7 @@ def __init__(
**metadata,
}
self._metadata = metadata or {}
self._description = description

@property
def experiment_name(self) -> str:
@@ -580,6 +593,7 @@ def _get_project(self, first_example: schemas.Example) -> schemas.TracerSession:
project_metadata = self._get_experiment_metadata()
project = self.client.create_project(
self.experiment_name,
description=self._description,
reference_dataset_id=first_example.dataset_id,
metadata=project_metadata,
)
@@ -649,11 +663,13 @@ def __init__(
runs: Optional[Iterable[schemas.Run]] = None,
evaluation_results: Optional[Iterable[EvaluationResults]] = None,
summary_results: Optional[Iterable[EvaluationResults]] = None,
description: Optional[str] = None,
):
super().__init__(
experiment=experiment,
metadata=metadata,
client=client,
description=description,
)
self._data = data
self._examples: Optional[Iterable[schemas.Example]] = None
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.53"
version = "0.1.54"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
4 changes: 3 additions & 1 deletion python/tests/evaluation/test_evaluation.py
@@ -34,6 +34,7 @@ def predict(inputs: dict) -> dict:
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
description="My sync experiment",
metadata={
"my-prompt-version": "abcd-1234",
"function": "evaluate",
@@ -70,9 +71,10 @@ async def apredict(inputs: dict) -> dict:
evaluators=[accuracy],
summary_evaluators=[precision],
experiment_prefix="My Experiment",
description="My Experiment Description",
metadata={
"my-prompt-version": "abcd-1234",
"function": "agevaluate",
"function": "aevaluate",
},
)

