diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
index f28d73ede..e862470f7 100644
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Install dependencies
         run: |
           poetry install --with dev
-          poetry run pip install -U langchain
+          poetry run pip install -U langchain langchain_anthropic langchain_openai rapidfuzz
       - name: Run Python integration tests
         uses: ./.github/actions/python-integration-tests
         with:
diff --git a/js/src/client.ts b/js/src/client.ts
index df427edbb..995d59668 100644
--- a/js/src/client.ts
+++ b/js/src/client.ts
@@ -239,7 +239,7 @@ export type CreateExampleOptions = {
   exampleId?: string;
 
   metadata?: KVMap;
-  split?: string;
+  split?: string | string[];
 };
 
 type AutoBatchQueueItem = {
@@ -2036,7 +2036,7 @@ export class Client {
    inputs: Array<KVMap>;
    outputs?: Array<KVMap>;
    metadata?: Array<KVMap>;
-    splits?: Array<string>;
+    splits?: Array<string | Array<string>>;
    sourceRunIds?: Array<string>;
    exampleIds?: Array<string>;
    datasetId?: string;
diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts
index cdced3c67..f3f981dae 100644
--- a/js/src/evaluation/_runner.ts
+++ b/js/src/evaluation/_runner.ts
@@ -694,6 +694,21 @@ class _ExperimentManager {
     ).date;
   }
 
+  async _getDatasetSplits(): Promise<string[] | undefined> {
+    const examples = await this.getExamples();
+    const allSplits = examples.reduce((acc, ex) => {
+      if (ex.metadata && ex.metadata.dataset_split) {
+        if (Array.isArray(ex.metadata.dataset_split)) {
+          ex.metadata.dataset_split.forEach((split) => acc.add(split));
+        } else if (typeof ex.metadata.dataset_split === "string") {
+          acc.add(ex.metadata.dataset_split);
+        }
+      }
+      return acc;
+    }, new Set<string>());
+    return allSplits.size ? Array.from(allSplits) : undefined;
+  }
+
   async _end(): Promise<void> {
     const experiment = this._experiment;
     if (!experiment) {
@@ -701,6 +716,7 @@
     }
     const projectMetadata = await this._getExperimentMetadata();
     projectMetadata["dataset_version"] = await this._getDatasetVersion();
+    projectMetadata["dataset_splits"] = await this._getDatasetSplits();
     // Update revision_id if not already set
     if (!projectMetadata["revision_id"]) {
       projectMetadata["revision_id"] = await getDefaultRevisionId();
diff --git a/js/src/schemas.ts b/js/src/schemas.ts
index ee8a11036..f33ee1f80 100644
--- a/js/src/schemas.ts
+++ b/js/src/schemas.ts
@@ -229,7 +229,7 @@ export interface RunUpdate {
 export interface ExampleCreate extends BaseExample {
   id?: string;
   created_at?: string;
-  split?: string;
+  split?: string | string[];
 }
 
 export interface Example extends BaseExample {
@@ -245,7 +245,7 @@ export interface ExampleUpdate {
   inputs?: KVMap;
   outputs?: KVMap;
   metadata?: KVMap;
-  split?: string;
+  split?: string | string[];
 }
 export interface BaseDataset {
   name: string;
diff --git a/js/src/tests/client.int.test.ts b/js/src/tests/client.int.test.ts
index 7637bb821..0b87522e9 100644
--- a/js/src/tests/client.int.test.ts
+++ b/js/src/tests/client.int.test.ts
@@ -97,12 +97,22 @@ test.concurrent("Test LangSmith Client Dataset CRD", async () => {
   await client.updateExample(example.id, {
     inputs: { col1: "updatedExampleCol1" },
     outputs: { col2: "updatedExampleCol2" },
-    split: "my_split2",
+    split: ["my_split2"],
   });
   // Says 'example updated' or something similar
   const newExampleValue = await client.readExample(example.id);
   expect(newExampleValue.inputs.col1).toBe("updatedExampleCol1");
-  expect(newExampleValue.metadata?.dataset_split).toBe("my_split2");
+  expect(newExampleValue.metadata?.dataset_split).toStrictEqual(["my_split2"]);
+
+  await client.updateExample(example.id, {
+    inputs: { col1: "updatedExampleCol3" },
+    outputs: { col2: "updatedExampleCol4" },
+    split: "my_split3",
+  });
+  // Says 'example updated' or something similar
+  const newExampleValue2 = await client.readExample(example.id);
+  expect(newExampleValue2.inputs.col1).toBe("updatedExampleCol3");
+  expect(newExampleValue2.metadata?.dataset_split).toStrictEqual(["my_split3"]);
   await client.deleteExample(example.id);
   const examples2 = await toArray(
     client.listExamples({ datasetId: newDataset.id })
   );
@@ -489,7 +499,7 @@ test.concurrent(
       { output: "hi there 3" },
     ],
     metadata: [{ key: "value 1" }, { key: "value 2" }, { key: "value 3" }],
-    splits: ["train", "test", "train"],
+    splits: ["train", "test", ["train", "validation"]],
     datasetId: dataset.id,
   });
   const initialExamplesList = await toArray(
@@ -520,19 +530,20 @@
   );
   expect(example1?.outputs?.output).toEqual("hi there 1");
   expect(example1?.metadata?.key).toEqual("value 1");
-  expect(example1?.metadata?.dataset_split).toEqual("train");
+  expect(example1?.metadata?.dataset_split).toEqual(["train"]);
   const example2 = examplesList2.find(
     (e) => e.inputs.input === "hello world 2"
   );
   expect(example2?.outputs?.output).toEqual("hi there 2");
   expect(example2?.metadata?.key).toEqual("value 2");
-  expect(example2?.metadata?.dataset_split).toEqual("test");
+  expect(example2?.metadata?.dataset_split).toEqual(["test"]);
   const example3 = examplesList2.find(
     (e) => e.inputs.input === "hello world 3"
   );
   expect(example3?.outputs?.output).toEqual("hi there 3");
   expect(example3?.metadata?.key).toEqual("value 3");
-  expect(example3?.metadata?.dataset_split).toEqual("train");
+  expect(example3?.metadata?.dataset_split).toContain("train");
+  expect(example3?.metadata?.dataset_split).toContain("validation");
 
   await client.createExample(
     { input: "hello world" },
diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts
index 198f66473..5e1321d06 100644
--- a/js/src/tests/evaluate.int.test.ts
+++ b/js/src/tests/evaluate.int.test.ts
@@ -1,6 +1,6 @@
 import { EvaluationResult } from "../evaluation/evaluator.js";
 import { evaluate } from "../evaluation/_runner.js";
-import { Example, Run } from "../schemas.js";
+import { Example, Run, TracerSession } from "../schemas.js";
 import { Client } from "../index.js";
 import { afterAll, beforeAll } from "@jest/globals";
 import { RunnableLambda } from "@langchain/core/runnables";
@@ -30,6 +30,13 @@ afterAll(async () => {
   await client.deleteDataset({
     datasetName: TESTING_DATASET_NAME,
   });
+  try {
+    await client.deleteDataset({
+      datasetName: "my_splits_ds2",
+    });
+  } catch (_) {
+    // pass
+  }
 });
 
 test("evaluate can evaluate", async () => {
@@ -351,6 +358,82 @@ test("can pass multiple evaluators", async () => {
   );
 });
 
+test("split info saved correctly", async () => {
+  const client = new Client();
+  // create a new dataset
+  await client.createDataset("my_splits_ds2", {
+    description:
+      "For testing purposes. Is created & deleted for each test run.",
+  });
+  // create examples
+  await client.createExamples({
+    inputs: [{ input: 1 }, { input: 2 }, { input: 3 }],
+    outputs: [{ output: 2 }, { output: 3 }, { output: 4 }],
+    splits: [["test"], ["train"], ["validation", "test"]],
+    datasetName: "my_splits_ds2",
+  });
+
+  const targetFunc = (input: Record<string, any>) => {
+    console.log("__input__", input);
+    return {
+      foo: input.input + 1,
+    };
+  };
+  await evaluate(targetFunc, {
+    data: client.listExamples({ datasetName: "my_splits_ds2" }),
+    description: "splits info saved correctly",
+  });
+
+  const exp = client.listProjects({ referenceDatasetName: "my_splits_ds2" });
+  let myExp: TracerSession | null = null;
+  for await (const session of exp) {
+    myExp = session;
+  }
+  expect(myExp?.extra?.metadata?.dataset_splits.sort()).toEqual(
+    ["test", "train", "validation"].sort()
+  );
+
+  await evaluate(targetFunc, {
+    data: client.listExamples({
+      datasetName: "my_splits_ds2",
+      splits: ["test"],
+    }),
+    description: "splits info saved correctly",
+  });
+
+  const exp2 = client.listProjects({ referenceDatasetName: "my_splits_ds2" });
+  let myExp2: TracerSession | null = null;
+  for await (const session of exp2) {
+    if (myExp2 === null || session.start_time > myExp2.start_time) {
+      myExp2 = session;
+    }
+  }
+
+  expect(myExp2?.extra?.metadata?.dataset_splits.sort()).toEqual(
+    ["test", "validation"].sort()
+  );
+
+  await evaluate(targetFunc, {
+    data: client.listExamples({
+      datasetName: "my_splits_ds2",
+      splits: ["train"],
+    }),
+    description: "splits info saved correctly",
+  });
+
+  const exp3 = client.listProjects({ referenceDatasetName: "my_splits_ds2" });
+  let myExp3: TracerSession | null = null;
+  for await (const session of exp3) {
+    if (myExp3 === null || session.start_time > myExp3.start_time) {
+      myExp3 = session;
+    }
+  }
+
+  expect(myExp3?.extra?.metadata?.dataset_splits.sort()).toEqual(
+    ["train"].sort()
+  );
+});
+
 test("can pass multiple summary evaluators", async () => {
   const targetFunc = (input: Record<string, any>) => {
     console.log("__input__", input);
diff --git a/python/langsmith/client.py b/python/langsmith/client.py
index 9a822fd90..f63a47f65 100644
--- a/python/langsmith/client.py
+++ b/python/langsmith/client.py
@@ -2936,7 +2936,7 @@ def create_examples(
         inputs: Sequence[Mapping[str, Any]],
         outputs: Optional[Sequence[Optional[Mapping[str, Any]]]] = None,
         metadata: Optional[Sequence[Optional[Mapping[str, Any]]]] = None,
-        splits: Optional[Sequence[Optional[str]]] = None,
+        splits: Optional[Sequence[Optional[str | List[str]]]] = None,
         source_run_ids: Optional[Sequence[Optional[ID_TYPE]]] = None,
         ids: Optional[Sequence[Optional[ID_TYPE]]] = None,
         dataset_id: Optional[ID_TYPE] = None,
@@ -2953,6 +2953,9 @@
             The output values for the examples.
         metadata : Optional[Sequence[Optional[Mapping[str, Any]]]], default=None
             The metadata for the examples.
+        splits : Optional[Sequence[Optional[str | List[str]]]], default=None
+            The splits for the examples, which are divisions
+            of your dataset such as 'train', 'test', or 'validation'.
         source_run_ids : Optional[Sequence[Optional[ID_TYPE]]], default=None
             The IDs of the source runs associated with the examples.
         ids : Optional[Sequence[ID_TYPE]], default=None
@@ -3012,7 +3015,7 @@ def create_example(
         created_at: Optional[datetime.datetime] = None,
         outputs: Optional[Mapping[str, Any]] = None,
         metadata: Optional[Mapping[str, Any]] = None,
-        split: Optional[str] = None,
+        split: Optional[str | List[str]] = None,
         example_id: Optional[ID_TYPE] = None,
     ) -> ls_schemas.Example:
         """Create a dataset example in the LangSmith API.
@@ -3034,6 +3037,9 @@
             The output values for the example.
         metadata : Mapping[str, Any] or None, default=None
             The metadata for the example.
+        split : str or List[str] or None, default=None
+            The splits for the example, which are divisions
+            of your dataset such as 'train', 'test', or 'validation'.
         example_id : UUID or None, default=None
             The ID of the example to create. If not provided, a new
             example will be created.
@@ -3165,7 +3171,7 @@ def update_example(
         inputs: Optional[Dict[str, Any]] = None,
         outputs: Optional[Mapping[str, Any]] = None,
         metadata: Optional[Dict] = None,
-        split: Optional[str] = None,
+        split: Optional[str | List[str]] = None,
         dataset_id: Optional[ID_TYPE] = None,
     ) -> Dict[str, Any]:
         """Update a specific example.
@@ -3180,6 +3186,9 @@
            The output values to update.
         metadata : Dict or None, default=None
            The metadata to update.
+        split : str or List[str] or None, default=None
+            The dataset split to update, such as
+            'train', 'test', or 'validation'.
         dataset_id : UUID or None, default=None
            The ID of the dataset to update.
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index 3c07ed165..27910b90b 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -1322,6 +1322,23 @@ def _get_dataset_version(self) -> Optional[str]:
         max_modified_at = max(modified_at) if modified_at else None
         return max_modified_at.isoformat() if max_modified_at else None
 
+    def _get_dataset_splits(self) -> Optional[list[str]]:
+        examples = list(self.examples)
+        splits = set()
+        for example in examples:
+            if (
+                example.metadata
+                and example.metadata.get("dataset_split")
+                and isinstance(example.metadata["dataset_split"], list)
+            ):
+                for split in example.metadata["dataset_split"]:
+                    if isinstance(split, str):
+                        splits.add(split)
+            else:
+                splits.add("base")
+
+        return list(splits)
+
     def _end(self) -> None:
         experiment = self._experiment
         if experiment is None:
@@ -1329,6 +1346,7 @@
 
         project_metadata = self._get_experiment_metadata()
         project_metadata["dataset_version"] = self._get_dataset_version()
+        project_metadata["dataset_splits"] = self._get_dataset_splits()
         self.client.update_project(
             experiment.id,
             end_time=datetime.datetime.now(datetime.timezone.utc),
diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
index ee57ffd32..758530e03 100644
--- a/python/langsmith/schemas.py
+++ b/python/langsmith/schemas.py
@@ -63,7 +63,7 @@ class ExampleCreate(ExampleBase):
 
     id: Optional[UUID]
     created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
-    split: Optional[str] = None
+    split: Optional[Union[str, List[str]]] = None
 
 
 class Example(ExampleBase):
@@ -106,7 +106,7 @@ class ExampleUpdate(BaseModel):
    inputs: Optional[Dict[str, Any]] = None
    outputs: Optional[Dict[str, Any]] = None
    metadata: Optional[Dict[str, Any]] = None
-    split: Optional[str] = None
+    split: Optional[Union[str, List[str]]] = None
 
    class Config:
        """Configuration class for the schema."""
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index c037bfd65..9107cc9f9 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -109,9 +109,9 @@ def test_datasets(langchain_client: Client) -> None:
 def test_list_examples(langchain_client: Client) -> None:
     """Test list_examples."""
     examples = [
-        ("Shut up, idiot", "Toxic", "train"),
+        ("Shut up, idiot", "Toxic", ["train", "validation"]),
         ("You're a wonderful person", "Not toxic", "test"),
-        ("This is the worst thing ever", "Toxic", "train"),
+        ("This is the worst thing ever", "Toxic", ["train"]),
         ("I had a great day today", "Not toxic", "test"),
         ("Nobody likes you", "Toxic", "train"),
         ("This is unacceptable. I want to speak to the manager.", "Not toxic", None),
@@ -133,6 +133,11 @@
     )
     assert len(example_list) == 3
 
+    example_list = list(
+        langchain_client.list_examples(dataset_id=dataset.id, splits=["validation"])
+    )
+    assert len(example_list) == 1
+
     example_list = list(
         langchain_client.list_examples(dataset_id=dataset.id, splits=["test"])
     )
@@ -148,11 +153,21 @@
             example.id
             for example in example_list
             if example.metadata is not None
-            and example.metadata.get("dataset_split") == "test"
+            and "test" in example.metadata.get("dataset_split", [])
         ][0],
         split="train",
     )
 
+    example_list = list(
+        langchain_client.list_examples(dataset_id=dataset.id, splits=["test"])
+    )
+    assert len(example_list) == 1
+
+    example_list = list(
+        langchain_client.list_examples(dataset_id=dataset.id, splits=["train"])
+    )
+    assert len(example_list) == 4
+
     langchain_client.create_example(
         inputs={"text": "What's up!"},
         outputs={"label": "Not toxic"},
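---

For reference, a minimal sketch (not part of the patch) of the split semantics these changes introduce, using only the parameters exercised in the diffs above; the dataset name and example values are illustrative:

    from langsmith import Client

    client = Client()
    dataset = client.create_dataset("splits-demo")  # illustrative name

    # `splits` now accepts a plain string or a list of splits per example.
    # The tests above assert that a string round-trips as a one-element
    # list in example.metadata["dataset_split"].
    client.create_examples(
        inputs=[{"text": "a"}, {"text": "b"}],
        outputs=[{"label": "x"}, {"label": "y"}],
        splits=[["train", "validation"], "test"],
        dataset_id=dataset.id,
    )

    # Filtering by one split returns every example that contains it,
    # including examples that belong to several splits.
    train = list(client.list_examples(dataset_id=dataset.id, splits=["train"]))

    # update_example replaces the example's split membership outright.
    client.update_example(train[0].id, split=["test"])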