Ensure examples are properly sorted (#561)
Also:
- fix the reference dataset ID type in the tracer schema
- fix a typing issue in the evaluate_run method
hinthornw authored Mar 27, 2024
1 parent f35d651 commit 25808cf
Showing 4 changed files with 52 additions and 26 deletions.
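The commit's central fix is an ordering one: runs pulled from an existing experiment and examples pulled from its reference dataset can arrive in different orders, so pairing them positionally could attach feedback to the wrong example. The following standalone Python sketch (FakeRun, FakeExample, and align are illustrative stand-ins, not library code and not part of the commit) shows why sorting both sides by the example ID restores the pairing:

import uuid
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeRun:
    # Stand-in for schemas.Run; only the field the sort key needs.
    reference_example_id: Optional[uuid.UUID]

@dataclass
class FakeExample:
    # Stand-in for schemas.Example.
    id: uuid.UUID

def align(runs: List[FakeRun], examples: List[FakeExample]):
    # Sorting both sides by the same key (the example id, as a string)
    # makes runs[i] line up with the example that produced it.
    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
    examples = sorted(examples, key=lambda e: str(e.id))
    return list(zip(runs, examples))

ids = [uuid.uuid4() for _ in range(3)]
examples = [FakeExample(id=i) for i in ids]
runs = [FakeRun(reference_example_id=i) for i in reversed(ids)]  # arrive out of order
for run, example in align(runs, examples):
    assert run.reference_example_id == example.id
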
js/src/schemas.ts (4 changes: 2 additions & 2 deletions)
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
last_run_start_time?: number;
// Feedback stats for the session.
feedback_stats?: Record<string, unknown>;
- // The reference dataset IDs this session's runs were generated on.
- reference_dataset_ids?: string[];
+ // The reference dataset ID this session's runs were generated on.
+ reference_dataset_id?: string;
// Facets for the runs in the session.
run_facets?: KVMap[];
}
python/langsmith/evaluation/_runner.py (68 changes: 46 additions & 22 deletions)
@@ -77,23 +77,23 @@ def evaluate(
r"""Evaluate a target system or function on a given dataset.
Args:
- target (TARGET_T): The target system or function to evaluate.
- data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
- examples, or a generator of examples.
- evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
- on each example. Defaults to None.
- summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
- evaluators to run on the entire dataset. Defaults to None.
- metadata (Optional[dict]): Metadata to attach to the experiment.
- Defaults to None.
- experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
- Defaults to None.
- max_concurrency (Optional[int]): The maximum number of concurrent
- evaluations to run. Defaults to None.
- client (Optional[langsmith.Client]): The LangSmith client to use.
- Defaults to None.
- blocking (bool): Whether to block until the evaluation is complete.
- Defaults to True.
+ target (TARGET_T): The target system or function to evaluate.
+ data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+ examples, or a generator of examples.
+ evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+ on each example. Defaults to None.
+ summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+ evaluators to run on the entire dataset. Defaults to None.
+ metadata (Optional[dict]): Metadata to attach to the experiment.
+ Defaults to None.
+ experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+ Defaults to None.
+ max_concurrency (Optional[int]): The maximum number of concurrent
+ evaluations to run. Defaults to None.
+ client (Optional[langsmith.Client]): The LangSmith client to use.
+ Defaults to None.
+ blocking (bool): Whether to block until the evaluation is complete.
+ Defaults to True.
Returns:
ExperimentResults: The results of the evaluation.
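As a point of reference, here is a hedged usage sketch assembled only from the arguments documented above; it assumes evaluate is importable from langsmith.evaluation as the module path suggests, and the dataset name, target, and evaluator (including its dict return shape) are hypothetical:

import langsmith
from langsmith.evaluation import evaluate

def my_target(inputs: dict) -> dict:
    # Hypothetical system under test.
    return {"answer": str(inputs.get("question", ""))[::-1]}

def exact_match(run, example):
    # Hypothetical per-example evaluator; the dict return shape is an assumption.
    return {"key": "exact_match", "score": int(run.outputs == example.outputs)}

results = evaluate(
    my_target,
    data="my-dataset",  # hypothetical dataset name
    evaluators=[exact_match],
    experiment_prefix="docs-sketch",
    max_concurrency=4,
    client=langsmith.Client(),
)
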
@@ -237,7 +237,6 @@ def evaluate(
def evaluate_existing(
experiment: Union[str, uuid.UUID],
/,
- data: DATA_T,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
@@ -297,13 +296,21 @@ def evaluate_existing(
>>> experiment_name = "My Experiment:64e6e91" # Or manually specify
>>> results = evaluate_existing(
... experiment_name,
- ... data=dataset_name,
... summary_evaluators=[precision],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
client = client or langsmith.Client()
+ project = _load_experiment(experiment, client)
runs = _load_traces(experiment, client, load_nested=load_nested)
+ data = list(
+     client.list_examples(
+         dataset_id=project.reference_dataset_id,
+         as_of=project.metadata.get("dataset_version"),
+     )
+ )
+ runs = sorted(runs, key=lambda r: str(r.reference_example_id))
+ data = sorted(data, key=lambda d: str(d.id))
return _evaluate(
runs,
data=data,
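The block added above resolves the experiment's reference dataset and pins the examples to the dataset version recorded in the experiment metadata. A sketch of the same pattern from user code, reusing only the calls that appear in this diff (read_project, list_examples); the experiment name is hypothetical and this is not part of the commit:

import langsmith

client = langsmith.Client()

# Resolve the experiment (tracer project); the name is hypothetical.
project = client.read_project(project_name="My Experiment:64e6e91")

# as_of pins the examples to the dataset version the experiment ran against,
# so later edits to the dataset do not skew a re-evaluation.
examples = list(
    client.list_examples(
        dataset_id=project.reference_dataset_id,
        as_of=project.metadata.get("dataset_version"),
    )
)
examples = sorted(examples, key=lambda e: str(e.id))
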
@@ -403,13 +410,15 @@ def _evaluate(
max_concurrency: Optional[int] = None,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
+ experiment: Optional[schemas.TracerSession] = None,
) -> ExperimentResults:
# Initialize the experiment manager.
manager = _ExperimentManager(
data,
client=client,
metadata=metadata,
experiment_prefix=experiment_prefix,
+ experiment=experiment,
# If provided, we don't need to create a new experiment.
runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target),
# Create or resolve the experiment.
@@ -441,17 +450,25 @@ def _is_uuid(value: str) -> bool:
return False


+ def _load_experiment(
+     project: Union[str, uuid.UUID], client: langsmith.Client
+ ) -> schemas.TracerSession:
+     if isinstance(project, uuid.UUID) or _is_uuid(project):
+         return client.read_project(project_id=project)
+     return client.read_project(project_name=project)


def _load_traces(
project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
- ) -> Iterable[schemas.Run]:
+ ) -> List[schemas.Run]:
"""Load nested traces for a given project."""
execution_order = None if load_nested else 1
if isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, execution_order=execution_order)
else:
runs = client.list_runs(project_name=project, execution_order=execution_order)
if not load_nested:
- return runs
+ return list(runs)

treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list)
results = []
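The switch from Iterable[schemas.Run] to List[schemas.Run] (and return list(runs)) matters once callers sort: sorted() drains a lazy generator, and a second pass over the same generator silently sees nothing. A tiny standalone illustration of that pitfall, with a stand-in generator rather than the real client:

def fake_list_runs():
    # Stand-in for a lazy client.list_runs(...) generator.
    yield from ({"id": i} for i in range(3))

runs_lazy = fake_list_runs()
first_pass = sorted(runs_lazy, key=lambda r: r["id"])
second_pass = sorted(runs_lazy, key=lambda r: r["id"])  # generator already exhausted
assert len(first_pass) == 3 and len(second_pass) == 0

# Materializing up front, as _load_traces now does with list(runs), keeps them reusable.
runs = list(fake_list_runs())
assert len(sorted(runs, key=lambda r: r["id"])) == 3
assert len(runs) == 3
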
@@ -938,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None:
reference_example_id=example.id,
on_end=_get_run,
project_name=experiment_name,
- metadata=metadata,
+ metadata={
+     **metadata,
+     "example_version": (
+         example.modified_at.isoformat()
+         if example.modified_at
+         else example.created_at.isoformat()
+     ),
+ },
client=client,
),
)
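The example_version stamp added above ties each run to the exact revision of the example that produced it. A small standalone sketch of the same logic (example_version here is a hypothetical helper, not part of the library):

from datetime import datetime, timezone
from typing import Optional

def example_version(modified_at: Optional[datetime], created_at: datetime) -> str:
    # Mirrors the logic above: prefer modified_at, fall back to created_at.
    return (modified_at or created_at).isoformat()

created = datetime(2024, 3, 1, tzinfo=timezone.utc)
modified = datetime(2024, 3, 20, tzinfo=timezone.utc)
assert example_version(None, created) == "2024-03-01T00:00:00+00:00"
assert example_version(modified, created) == "2024-03-20T00:00:00+00:00"
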
python/langsmith/evaluation/evaluator.py (4 changes: 3 additions & 1 deletion)
@@ -161,7 +161,9 @@ def evaluate_run(
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
""" # noqa: E501
source_run_id = uuid.uuid4()
metadata = {"target_run_id": run.id, "experiment": run.session_id}
metadata = {"target_run_id": run.id}
if getattr(run, "session_id"):
metadata["experiment"] = run.session_id
result = self.func(
run,
example,
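The getattr guard added above keeps evaluate_run from assuming every run carries a session_id. A minimal sketch of the same guard with stand-in objects, using an explicit getattr default (which the diff omits) so objects lacking the attribute entirely are also covered:

import uuid
from types import SimpleNamespace

def build_metadata(run) -> dict:
    # Same shape as the guarded construction in the diff.
    metadata = {"target_run_id": run.id}
    if getattr(run, "session_id", None):
        metadata["experiment"] = run.session_id
    return metadata

with_session = SimpleNamespace(id=uuid.uuid4(), session_id=uuid.uuid4())
without_session = SimpleNamespace(id=uuid.uuid4(), session_id=None)

assert "experiment" in build_metadata(with_session)
assert "experiment" not in build_metadata(without_session)
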
python/langsmith/schemas.py (2 changes: 1 addition & 1 deletion)
@@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession):
"""The start time of the last run in the project."""
feedback_stats: Optional[Dict[str, Any]]
"""Feedback stats for the project."""
- reference_dataset_ids: Optional[List[UUID]]
+ reference_dataset_id: Optional[UUID]
"""The reference dataset IDs this project's runs were generated on."""
run_facets: Optional[List[Dict[str, Any]]]
"""Facets for the runs in the project."""
