diff --git a/js/src/schemas.ts b/js/src/schemas.ts
index 9ce47af21..bcf94ee62 100644
--- a/js/src/schemas.ts
+++ b/js/src/schemas.ts
@@ -32,8 +32,8 @@ export interface TracerSessionResult extends TracerSession {
   last_run_start_time?: number;
   // Feedback stats for the session.
   feedback_stats?: Record<string, any>;
-  // The reference dataset IDs this session's runs were generated on.
-  reference_dataset_ids?: string[];
+  // The reference dataset ID this session's runs were generated on.
+  reference_dataset_id?: string;
   // Facets for the runs in the session.
   run_facets?: KVMap[];
 }
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index c9563caf4..465e14944 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -77,23 +77,23 @@ def evaluate(
     r"""Evaluate a target system or function on a given dataset.
 
     Args:
-            target (TARGET_T): The target system or function to evaluate.
-            data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
-                examples, or a generator of examples.
-            evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
-                on each example. Defaults to None.
-            summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
-                evaluators to run on the entire dataset. Defaults to None.
-            metadata (Optional[dict]): Metadata to attach to the experiment.
-                Defaults to None.
-            experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
-                Defaults to None.
-            max_concurrency (Optional[int]): The maximum number of concurrent
-                evaluations to run. Defaults to None.
-            client (Optional[langsmith.Client]): The LangSmith client to use.
-                Defaults to None.
-            blocking (bool): Whether to block until the evaluation is complete.
-                Defaults to True.
+        target (TARGET_T): The target system or function to evaluate.
+        data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of
+            examples, or a generator of examples.
+        evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run
+            on each example. Defaults to None.
+        summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary
+            evaluators to run on the entire dataset. Defaults to None.
+        metadata (Optional[dict]): Metadata to attach to the experiment.
+            Defaults to None.
+        experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
+            Defaults to None.
+        max_concurrency (Optional[int]): The maximum number of concurrent
+            evaluations to run. Defaults to None.
+        client (Optional[langsmith.Client]): The LangSmith client to use.
+            Defaults to None.
+        blocking (bool): Whether to block until the evaluation is complete.
+            Defaults to True.
 
     Returns:
         ExperimentResults: The results of the evaluation.
@@ -138,6 +138,9 @@ def evaluate(
         ...     data=dataset_name,
         ...     evaluators=[accuracy],
         ...     summary_evaluators=[precision],
+        ...     metadata={
+        ...         "my-prompt-version": "abcd-1234",
+        ...     },
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
 
@@ -234,7 +237,6 @@ def evaluate(
 def evaluate_existing(
     experiment: Union[str, uuid.UUID],
     /,
-    data: DATA_T,
     evaluators: Optional[Sequence[EVALUATOR_T]] = None,
     summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
     metadata: Optional[dict] = None,
@@ -294,13 +296,21 @@ def evaluate_existing(
         >>> experiment_name = "My Experiment:64e6e91"  # Or manually specify
         >>> results = evaluate_existing(
         ...     experiment_name,
-        ...     data=dataset_name,
         ...     summary_evaluators=[precision],
         ... )  # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
     """  # noqa: E501
     client = client or langsmith.Client()
+    project = _load_experiment(experiment, client)
     runs = _load_traces(experiment, client, load_nested=load_nested)
+    data = list(
+        client.list_examples(
+            dataset_id=project.reference_dataset_id,
+            as_of=project.metadata.get("dataset_version"),
+        )
+    )
+    runs = sorted(runs, key=lambda r: str(r.reference_example_id))
+    data = sorted(data, key=lambda d: str(d.id))
     return _evaluate(
         runs,
         data=data,
@@ -400,6 +410,7 @@ def _evaluate(
     max_concurrency: Optional[int] = None,
     client: Optional[langsmith.Client] = None,
     blocking: bool = True,
+    experiment: Optional[schemas.TracerSession] = None,
 ) -> ExperimentResults:
     # Initialize the experiment manager.
     manager = _ExperimentManager(
@@ -407,6 +418,7 @@ def _evaluate(
         client=client,
         metadata=metadata,
         experiment_prefix=experiment_prefix,
+        experiment=experiment,
         # If provided, we don't need to create a new experiment.
         runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target),
         # Create or resolve the experiment.
@@ -438,9 +450,17 @@ def _is_uuid(value: str) -> bool:
         return False
 
 
+def _load_experiment(
+    project: Union[str, uuid.UUID], client: langsmith.Client
+) -> schemas.TracerSession:
+    if isinstance(project, uuid.UUID) or _is_uuid(project):
+        return client.read_project(project_id=project)
+    return client.read_project(project_name=project)
+
+
 def _load_traces(
     project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
-) -> Iterable[schemas.Run]:
+) -> List[schemas.Run]:
     """Load nested traces for a given project."""
     execution_order = None if load_nested else 1
     if isinstance(project, uuid.UUID) or _is_uuid(project):
@@ -448,7 +468,7 @@ def _load_traces(
     else:
         runs = client.list_runs(project_name=project, execution_order=execution_order)
     if not load_nested:
-        return runs
+        return list(runs)
 
     treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list)
     results = []
@@ -560,13 +580,7 @@ def start(self) -> _ExperimentManager:
         _examples = itertools.chain([first_example], self.examples)
         if self._experiment is None:
             try:
-                project_metadata = self._metadata or {}
-                git_info = ls_env.get_git_info()
-                if git_info:
-                    project_metadata = {
-                        **project_metadata,
-                        "git": git_info,
-                    }
+                project_metadata = self._get_experiment_metadata()
                 project = self.client.create_project(
                     self.experiment_name,
                     reference_dataset_id=first_example.dataset_id,
@@ -702,6 +716,21 @@ def get_summary_scores(self) -> Dict[str, List[dict]]:
 
     # Private methods.
 
+    def _get_experiment_metadata(self):
+        project_metadata = self._metadata or {}
+        git_info = ls_env.get_git_info()
+        if git_info:
+            project_metadata = {
+                **project_metadata,
+                "git": git_info,
+            }
+        if self._experiment:
+            project_metadata = {
+                **self._experiment.metadata,
+                **project_metadata,
+            }
+        return project_metadata
+
     def _get_experiment(self) -> schemas.TracerSession:
         if self._experiment is None:
             raise ValueError("Experiment not started yet.")
@@ -716,15 +745,15 @@ def _end(self) -> None:
         # Should always be defined in practice when fetched,
         # but the typing permits None
         max_modified_at = max(modified_at) if modified_at else None
+        project_metadata = self._get_experiment_metadata()
+        project_metadata["dataset_version"] = (
+            max_modified_at.isoformat() if max_modified_at else None
+        )
 
         self.client.update_project(
             experiment.id,
             end_time=datetime.datetime.now(datetime.timezone.utc),
-            metadata={
-                "dataset_version": (
-                    max_modified_at.isoformat() if max_modified_at else None
-                )
-            },
+            metadata=project_metadata,
         )
 
     def _predict(
@@ -926,7 +955,14 @@ def _get_run(r: run_trees.RunTree) -> None:
                 reference_example_id=example.id,
                 on_end=_get_run,
                 project_name=experiment_name,
-                metadata=metadata,
+                metadata={
+                    **metadata,
+                    "example_version": (
+                        example.modified_at.isoformat()
+                        if example.modified_at
+                        else example.created_at.isoformat()
+                    ),
+                },
                 client=client,
             ),
         )
diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index 65e123042..a077121ff 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -161,7 +161,9 @@ def evaluate_run(
             Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
         """  # noqa: E501
         source_run_id = uuid.uuid4()
-        metadata = {"target_run_id": run.id, "experiment": run.session_id}
+        metadata = {"target_run_id": run.id}
+        if getattr(run, "session_id", None):
+            metadata["experiment"] = run.session_id
         result = self.func(
             run,
             example,
diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
index 372894cea..97e5b7cfa 100644
--- a/python/langsmith/schemas.py
+++ b/python/langsmith/schemas.py
@@ -558,7 +558,7 @@ class TracerSessionResult(TracerSession):
     """The start time of the last run in the project."""
    feedback_stats: Optional[Dict[str, Any]]
     """Feedback stats for the project."""
-    reference_dataset_ids: Optional[List[UUID]]
-    """The reference dataset IDs this project's runs were generated on."""
+    reference_dataset_id: Optional[UUID]
+    """The reference dataset ID this project's runs were generated on."""
     run_facets: Optional[List[Dict[str, Any]]]
     """Facets for the runs in the project."""
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 81a66fc2f..ccaa30f32 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.1.33"
+version = "0.1.34"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <hello@langchain.dev>"]
 license = "MIT"
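
Usage sketch for the updated evaluate_existing call: with the data argument removed, examples are loaded from the experiment's reference_dataset_id as of the dataset_version recorded in the project metadata (via client.list_examples, as added above). The experiment name and the correctness evaluator below are placeholders for illustration, not values taken from this change.

from langsmith.evaluation import evaluate_existing

def correctness(run, example):
    # Placeholder row-level evaluator: compare the run's outputs to the example's outputs.
    return {"key": "correctness", "score": int(run.outputs == example.outputs)}

# No data= argument: the experiment's reference dataset (at the recorded
# dataset_version) is fetched for you and aligned with the existing runs.
results = evaluate_existing(
    "My Experiment:64e6e91",  # experiment name or UUID (placeholder)
    evaluators=[correctness],
)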