diff --git a/js/src/schemas.ts b/js/src/schemas.ts
index 9ce47af21..bcf94ee62 100644
--- a/js/src/schemas.ts
+++ b/js/src/schemas.ts
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
   last_run_start_time?: number;
   // Feedback stats for the session.
   feedback_stats?: Record;
-  // The reference dataset IDs this session's runs were generated on.
-  reference_dataset_ids?: string[];
+  // The reference dataset ID this session's runs were generated on.
+  reference_dataset_id?: string;
   // Facets for the runs in the session.
   run_facets?: KVMap[];
 }
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index 207b10bf6..465e14944 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -77,23 +77,23 @@ def evaluate(
     r"""Evaluate a target system or function on a given dataset.
 
     Args:
-        target (TARGET_T): The target system or function to evaluate.
-        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
-            examples, or a generator of examples.
-        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
-            on each example. Defaults to None.
-        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
-            evaluators to run on the entire dataset. Defaults to None.
-        metadata (Optional[dict]): Metadata to attach to the experiment.
-            Defaults to None.
-        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
-            Defaults to None.
-        max_concurrency (Optional[int]): The maximum number of concurrent
-            evaluations to run. Defaults to None.
-        client (Optional[langsmith.Client]): The LangSmith client to use.
-            Defaults to None.
-        blocking (bool): Whether to block until the evaluation is complete.
-            Defaults to True.
+        target (TARGET_T): The target system or function to evaluate.
+        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+            examples, or a generator of examples.
+        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+            on each example. Defaults to None.
+        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+            evaluators to run on the entire dataset. Defaults to None.
+        metadata (Optional[dict]): Metadata to attach to the experiment.
+            Defaults to None.
+        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+            Defaults to None.
+        max_concurrency (Optional[int]): The maximum number of concurrent
+            evaluations to run. Defaults to None.
+        client (Optional[langsmith.Client]): The LangSmith client to use.
+            Defaults to None.
+        blocking (bool): Whether to block until the evaluation is complete.
+            Defaults to True.
 
     Returns:
         ExperimentResults: The results of the evaluation.
@@ -237,7 +237,6 @@ def evaluate(
 def evaluate_existing(
     experiment: Union[str, uuid.UUID],
     /,
-    data: DATA_T,
     evaluators: Optional[Sequence[EVALUATOR_T]] = None,
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
@@ -297,13 +296,21 @@ def evaluate_existing(
         >>> experiment_name = "My Experiment:64e6e91"  # Or manually specify
         >>> results = evaluate_existing(
         ...     experiment_name,
-        ...     data=dataset_name,
         ...     summary_evaluators=[precision],
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
""" # noqa: E501 client = client or langsmith.Client() + project = _load_experiment(experiment, client) runs = _load_traces(experiment, client, load_nested=load_nested) + data = list( + client.list_examples( + dataset_id=project.reference_dataset_id, + as_of=project.metadata.get("dataset_version"), + ) + ) + runs = sorted(runs, key=lambda r: str(r.reference_example_id)) + data = sorted(data, key=lambda d: str(d.id)) return _evaluate( runs, data=data, @@ -403,6 +410,7 @@ def _evaluate( max_concurrency: Optional[int] = None, client: Optional[langsmith.Client] = None, blocking: bool = True, + experiment: Optional[schemas.TracerSession] = None, ) -> ExperimentResults: # Initialize the experiment manager. manager = _ExperimentManager( @@ -410,6 +418,7 @@ def _evaluate( client=client, metadata=metadata, experiment_prefix=experiment_prefix, + experiment=experiment, # If provided, we don't need to create a new experiment. runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target), # Create or resolve the experiment. @@ -441,9 +450,17 @@ def _is_uuid(value: str) -> bool: return False +def _load_experiment( + project: Union[str, uuid.UUID], client: langsmith.Client +) -> schemas.TracerSession: + if isinstance(project, uuid.UUID) or _is_uuid(project): + return client.read_project(project_id=project) + return client.read_project(project_name=project) + + def _load_traces( project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False -) -> Iterable[schemas.Run]: +) -> List[schemas.Run]: """Load nested traces for a given project.""" execution_order = None if load_nested else 1 if isinstance(project, uuid.UUID) or _is_uuid(project): @@ -451,7 +468,7 @@ def _load_traces( else: runs = client.list_runs(project_name=project, execution_order=execution_order) if not load_nested: - return runs + return list(runs) treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list) results = [] @@ -938,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None: reference_example_id=example.id, on_end=_get_run, project_name=experiment_name, - metadata=metadata, + metadata={ + **metadata, + "example_version": ( + example.modified_at.isoformat() + if example.modified_at + else example.created_at.isoformat() + ), + }, client=client, ), ) diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 65e123042..a077121ff 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -161,7 +161,9 @@ def evaluate_run( Union[EvaluationResult, EvaluationResults]: The result of the evaluation. """ # noqa: E501 source_run_id = uuid.uuid4() - metadata = {"target_run_id": run.id, "experiment": run.session_id} + metadata = {"target_run_id": run.id} + if getattr(run, "session_id"): + metadata["experiment"] = run.session_id result = self.func( run, example, diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 372894cea..97e5b7cfa 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession): """The start time of the last run in the project.""" feedback_stats: Optional[Dict[str, Any]] """Feedback stats for the project.""" - reference_dataset_ids: Optional[List[UUID]] + reference_dataset_id: Optional[UUID] """The reference dataset IDs this project's runs were generated on.""" run_facets: Optional[List[Dict[str, Any]]] """Facets for the runs in the project."""