-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: schema validation in langsmith sdk #922
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
|
||
import pytest | ||
from freezegun import freeze_time | ||
from pydantic import BaseModel | ||
|
||
from langsmith.client import ID_TYPE, Client | ||
from langsmith.schemas import DataType | ||
|
@@ -312,11 +313,7 @@ def test_error_surfaced_invalid_uri(monkeypatch: pytest.MonkeyPatch, uri: str) - | |
client.create_run("My Run", inputs={"text": "hello world"}, run_type="llm") | ||
|
||
|
||
def test_create_dataset( | ||
monkeypatch: pytest.MonkeyPatch, langchain_client: Client | ||
) -> None: | ||
"""Test persisting runs and adding feedback.""" | ||
monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://dev.api.smith.langchain.com") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was a quirk where we were overriding a single test to use dev. If we want to test against dev, we should just configure the suite to run against dev in addition. |
||
def test_create_dataset(langchain_client: Client) -> None: | ||
dataset_name = "__test_create_dataset" + uuid4().hex[:4] | ||
if langchain_client.has_dataset(dataset_name=dataset_name): | ||
langchain_client.delete_dataset(dataset_name=dataset_name) | ||
|
@@ -360,6 +357,59 @@ def test_create_dataset( | |
langchain_client.delete_dataset(dataset_id=dataset.id) | ||
|
||
|
||
def test_dataset_schema_validation(langchain_client: Client) -> None: | ||
dataset_name = "__test_create_dataset" + uuid4().hex[:4] | ||
if langchain_client.has_dataset(dataset_name=dataset_name): | ||
langchain_client.delete_dataset(dataset_name=dataset_name) | ||
|
||
class InputSchema(BaseModel): | ||
input: str | ||
|
||
class OutputSchema(BaseModel): | ||
output: str | ||
|
||
dataset = langchain_client.create_dataset( | ||
dataset_name, | ||
data_type=DataType.kv, | ||
inputs_schema=InputSchema.model_json_schema(), | ||
outputs_schema=OutputSchema.model_json_schema(), | ||
) | ||
|
||
# confirm we store the schema from the create request | ||
assert dataset.inputs_schema == InputSchema.model_json_schema() | ||
assert dataset.outputs_schema == OutputSchema.model_json_schema() | ||
|
||
# create an example that matches the schema, which should succeed | ||
langchain_client.create_example( | ||
inputs={"input": "hello world"}, | ||
outputs={"output": "hello"}, | ||
dataset_id=dataset.id, | ||
) | ||
|
||
# create an example that does not match the input schema | ||
with pytest.raises(LangSmithError): | ||
langchain_client.create_example( | ||
inputs={"john": 1}, | ||
outputs={"output": "hello"}, | ||
dataset_id=dataset.id, | ||
) | ||
|
||
# create an example that does not match the output schema | ||
with pytest.raises(LangSmithError): | ||
langchain_client.create_example( | ||
inputs={"input": "hello world"}, | ||
outputs={"john": 1}, | ||
dataset_id=dataset.id, | ||
) | ||
|
||
# assert read API includes the schema definition | ||
read_dataset = langchain_client.read_dataset(dataset_id=dataset.id) | ||
assert read_dataset.inputs_schema == InputSchema.model_json_schema() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @hinthornw Here's the integration test for reading the input schema back out. |
||
assert read_dataset.outputs_schema == OutputSchema.model_json_schema() | ||
|
||
langchain_client.delete_dataset(dataset_id=dataset.id) | ||
|
||
|
||
@freeze_time("2023-01-01") | ||
def test_list_datasets(langchain_client: Client) -> None: | ||
ds1n = "__test_list_datasets1" + uuid4().hex[:4] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@hinthornw I had to do some pydantic magic to make all this work. Do we do this in the SDK? I see this pattern in runtree, but I know pydantic stuff is frowned upon in other parts of the codebase.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we just don't use pydantic in the `create_dataset` method anymore?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We'll need to remove it from all dataset-related areas then, because we'll need to do conversion on any read, create, etc.