Commit 843005f

Merge branch 'main' into dqbd/entrypoints-cjs

hinthornw authored Mar 27, 2024
2 parents 13c5dbb + 25808cf commit 843005f
Showing 5 changed files with 77 additions and 39 deletions.
js/src/schemas.ts (4 changes: 2 additions & 2 deletions)
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
last_run_start_time?: number;
// Feedback stats for the session.
feedback_stats?: Record<string, unknown>;
-  // The reference dataset IDs this session's runs were generated on.
-  reference_dataset_ids?: string[];
+  // The reference dataset ID this session's runs were generated on.
+  reference_dataset_id?: string;
// Facets for the runs in the session.
run_facets?: KVMap[];
}
python/langsmith/evaluation/_runner.py (104 changes: 70 additions & 34 deletions)
@@ -77,23 +77,23 @@ def evaluate(
r"""Evaluate a target system or function on a given dataset.
Args:
-    target (TARGET_T): The target system or function to evaluate.
-    data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
-        examples, or a generator of examples.
-    evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
-        on each example. Defaults to None.
-    summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
-        evaluators to run on the entire dataset. Defaults to None.
-    metadata (Optional[dict]): Metadata to attach to the experiment.
-        Defaults to None.
-    experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
-        Defaults to None.
-    max_concurrency (Optional[int]): The maximum number of concurrent
-        evaluations to run. Defaults to None.
-    client (Optional[langsmith.Client]): The LangSmith client to use.
-        Defaults to None.
-    blocking (bool): Whether to block until the evaluation is complete.
-        Defaults to True.
+        target (TARGET_T): The target system or function to evaluate.
+        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+            examples, or a generator of examples.
+        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+            on each example. Defaults to None.
+        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+            evaluators to run on the entire dataset. Defaults to None.
+        metadata (Optional[dict]): Metadata to attach to the experiment.
+            Defaults to None.
+        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+            Defaults to None.
+        max_concurrency (Optional[int]): The maximum number of concurrent
+            evaluations to run. Defaults to None.
+        client (Optional[langsmith.Client]): The LangSmith client to use.
+            Defaults to None.
+        blocking (bool): Whether to block until the evaluation is complete.
+            Defaults to True.
Returns:
ExperimentResults: The results of the evaluation.
@@ -138,6 +138,9 @@ def evaluate(
... data=dataset_name,
... evaluators=[accuracy],
... summary_evaluators=[precision],
+    ...     metadata={
+    ...         "my-prompt-version": "abcd-1234",
+    ...     },
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
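Note: the added doctest lines attach experiment-level metadata at call time. As a quick, hedged illustration (assuming a client and that the experiment name matches the doctest above; read_project and project.metadata are used this way elsewhere in this diff), those keys can be read back from the created project afterwards:

    project = client.read_project(project_name="My Experiment:64e6e91")
    print(project.metadata.get("my-prompt-version"))  # expected: "abcd-1234"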
@@ -234,7 +237,6 @@ def evaluate(
def evaluate_existing(
experiment: Union[str, uuid.UUID],
/,
-    data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
@@ -294,13 +296,21 @@ def evaluate_existing(
>>> experiment_name = "My Experiment:64e6e91" # Or manually specify
>>> results = evaluate_existing(
... experiment_name,
-    ...     data=dataset_name,
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
client = client or langsmith.Client()
+    project = _load_experiment(experiment, client)
runs = _load_traces(experiment, client, load_nested=load_nested)
+    data = list(
+        client.list_examples(
+            dataset_id=project.reference_dataset_id,
+            as_of=project.metadata.get("dataset_version"),
+        )
+    )
+    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
+    data = sorted(data, key=lambda d: str(d.id))
return _evaluate(
runs,
data=data,
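Note: with the data parameter removed, evaluate_existing now resolves examples itself from the experiment's reference_dataset_id (pinned to the recorded dataset_version) and aligns runs to examples by sorting both on the shared example ID. A minimal sketch of the updated call, mirroring the doctest above:

    # No dataset argument anymore; `precision` is the summary evaluator
    # defined in the doctest.
    results = evaluate_existing(
        "My Experiment:64e6e91",
        summary_evaluators=[precision],
    )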
@@ -400,13 +410,15 @@ def _evaluate(
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
+    experiment: Optional[schemas.TracerSession] = None,
) -> ExperimentResults:
# Initialize the experiment manager.
manager = _ExperimentManager(
data,
client=client,
metadata=metadata,
experiment_prefix=experiment_prefix,
+        experiment=experiment,
# If provided, we don't need to create a new experiment.
runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target),
# Create or resolve the experiment.
@@ -438,17 +450,25 @@ def _is_uuid(value: str) -> bool:
return False


+def _load_experiment(
+    project: Union[str, uuid.UUID], client: langsmith.Client
+) -> schemas.TracerSession:
+    if isinstance(project, uuid.UUID) or _is_uuid(project):
+        return client.read_project(project_id=project)
+    return client.read_project(project_name=project)
+

def _load_traces(
project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
-) -> Iterable[schemas.Run]:
+) -> List[schemas.Run]:
"""Load nested traces for a given project."""
execution_order = None if load_nested else 1
if isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, execution_order=execution_order)
else:
runs = client.list_runs(project_name=project, execution_order=execution_order)
-    if not load_nested:
-        return runs
+    return list(runs)

treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list)
results = []
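Note: _load_traces now returns a concrete List[schemas.Run] rather than a lazy iterable, so callers such as evaluate_existing can sort and re-iterate the runs without exhausting a one-shot generator. A small sketch under that assumption, using the module's own private helper:

    runs = _load_traces("My Experiment:64e6e91", client)  # now a list
    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
    print(len(runs))  # re-iteration is safe; sorted() consumed nothing permanently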
@@ -560,13 +580,7 @@ def start(self) -> _ExperimentManager:
_examples = itertools.chain([first_example], self.examples)
if self._experiment is None:
try:
-                project_metadata = self._metadata or {}
-                git_info = ls_env.get_git_info()
-                if git_info:
-                    project_metadata = {
-                        **project_metadata,
-                        "git": git_info,
-                    }
+                project_metadata = self._get_experiment_metadata()
project = self.client.create_project(
self.experiment_name,
reference_dataset_id=first_example.dataset_id,
@@ -702,6 +716,21 @@ def get_summary_scores(self) -> Dict[str, List[dict]]:

# Private methods.

+    def _get_experiment_metadata(self):
+        project_metadata = self._metadata or {}
+        git_info = ls_env.get_git_info()
+        if git_info:
+            project_metadata = {
+                **project_metadata,
+                "git": git_info,
+            }
+        if self._experiment:
+            project_metadata = {
+                **self._experiment.metadata,
+                **project_metadata,
+            }
+        return project_metadata
+
def _get_experiment(self) -> schemas.TracerSession:
if self._experiment is None:
raise ValueError("Experiment not started yet.")
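Note: the merge order in _get_experiment_metadata means git info is attached under a "git" key when available, and caller-supplied metadata wins over whatever the existing experiment already stored. A toy illustration with hypothetical values (plain dicts, runnable standalone):

    stored = {"dataset_version": "2024-03-01T00:00:00+00:00"}  # self._experiment.metadata
    supplied = {"my-prompt-version": "abcd-1234"}              # self._metadata
    git_info = {"commit": "25808cf"}                           # ls_env.get_git_info()
    merged = {**stored, **supplied, "git": git_info}           # later keys win on collision
    print(merged)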
@@ -716,15 +745,15 @@ def _end(self) -> None:
# Should always be defined in practice when fetched,
# but the typing permits None
max_modified_at = max(modified_at) if modified_at else None
+        project_metadata = self._get_experiment_metadata()
+        project_metadata["dataset_version"] = (
+            max_modified_at.isoformat() if max_modified_at else None
+        )

self.client.update_project(
experiment.id,
end_time=datetime.datetime.now(datetime.timezone.utc),
-            metadata={
-                "dataset_version": (
-                    max_modified_at.isoformat() if max_modified_at else None
-                )
-            },
+            metadata=project_metadata,
)

def _predict(
@@ -926,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None:
reference_example_id=example.id,
on_end=_get_run,
project_name=experiment_name,
-            metadata=metadata,
+            metadata={
+                **metadata,
+                "example_version": (
+                    example.modified_at.isoformat()
+                    if example.modified_at
+                    else example.created_at.isoformat()
+                ),
+            },
client=client,
),
)
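Note: the final hunk of this file stamps each target run's metadata with an example_version, falling back from the example's modified_at to its created_at when the example was never edited. A self-contained sketch of that fallback; the Example stub here is hypothetical:

    import datetime

    class Example:  # hypothetical stand-in for schemas.Example
        modified_at = None
        created_at = datetime.datetime(2024, 3, 27, tzinfo=datetime.timezone.utc)

    # Equivalent to the conditional in the diff when modified_at is None.
    version = (Example.modified_at or Example.created_at).isoformat()
    print(version)  # created_at is used because modified_at was never set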
python/langsmith/evaluation/evaluator.py (4 changes: 3 additions & 1 deletion)
@@ -161,7 +161,9 @@ def evaluate_run(
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
""" # noqa: E501
source_run_id = uuid.uuid4()
metadata = {"target_run_id": run.id, "experiment": run.session_id}
metadata = {"target_run_id": run.id}
if getattr(run, "session_id"):
metadata["experiment"] = run.session_id
result = self.func(
run,
example,
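Note: the guard above only attaches the "experiment" key when the run carries a truthy session_id. Two-argument getattr(run, "session_id") still raises AttributeError if the attribute is missing entirely, so this protects against None or empty values rather than absent attributes. A minimal illustration with a stub run:

    class Run:  # hypothetical stand-in for schemas.Run
        id = "run-123"
        session_id = None  # e.g. a run not yet tied to an experiment

    run = Run()
    metadata = {"target_run_id": run.id}
    if getattr(run, "session_id"):
        metadata["experiment"] = run.session_id
    print(metadata)  # {'target_run_id': 'run-123'}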
python/langsmith/schemas.py (2 changes: 1 addition & 1 deletion)
@@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession):
"""The start time of the last run in the project."""
feedback_stats: Optional[Dict[str, Any]]
"""Feedback stats for the project."""
-    reference_dataset_ids: Optional[List[UUID]]
+    reference_dataset_id: Optional[UUID]
"""The reference dataset IDs this project's runs were generated on."""
run_facets: Optional[List[Dict[str, Any]]]
"""Facets for the runs in the project."""
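Note: with the field now singular, resolving an experiment's dataset is a direct lookup, which is exactly how the new evaluate_existing path uses it. A sketch assuming a client and the experiment name from the examples above:

    project = client.read_project(project_name="My Experiment:64e6e91")
    examples = list(client.list_examples(dataset_id=project.reference_dataset_id))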
python/pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.33"
version = "0.1.34"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
