diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 80ba8db17..8a612b0b9 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -858,10 +858,17 @@ async def _aapply_summary_evaluators( summary_evaluators: Sequence[SUMMARY_EVALUATOR_T], ) -> AsyncIterator[EvaluationResults]: runs, examples, evaluation_results = [], [], [] - async for row in self.aget_results(): - runs.append(row["run"]) - examples.append(row["example"]) - evaluation_results.append(row["evaluation_results"]["results"]) + + async_examples = aitertools.ensure_async_iterator(await self.aget_examples()) + async for run, example in aitertools.async_zip( + self.aget_runs(), async_examples + ): + runs.append(run) + examples.append(example) + + async for evaluation_result in self.aget_evaluation_results(): + evaluation_results.append(evaluation_result["results"]) + aggregate_feedback = [] project_id = self._get_experiment().id if self._upload_results else None current_context = rh.get_tracing_context() diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 5148e99be..dbed77c2a 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1668,10 +1668,12 @@ def _apply_summary_evaluators( self, summary_evaluators: Sequence[SUMMARY_EVALUATOR_T] ) -> Generator[EvaluationResults, None, None]: runs, examples, evaluation_results = [], [], [] - for row in self.get_results(): - runs.append(row["run"]) - examples.append(row["example"]) - evaluation_results.append(row["evaluation_results"]["results"]) + for run, example in zip(self.runs, self.examples): + runs.append(run) + examples.append(example) + + for evaluation_result in self.evaluation_results: + evaluation_results.append(evaluation_result["results"]) aggregate_feedback = [] with ls_utils.ContextThreadPoolExecutor() as executor: @@ -1791,15 +1793,15 @@ def _wrap(evaluator: SUMMARY_EVALUATOR_T) -> SUMMARY_EVALUATOR_T: @functools.wraps(evaluator) def _wrapper_inner( - runs: list[schemas.Run], - examples: list[schemas.Example], - evaluation_results: list[list[EvaluationResult]], + runs: Sequence[schemas.Run], + examples: Sequence[schemas.Example], + evaluation_results: Sequence[list[EvaluationResult]], ) -> Union[EvaluationResult, EvaluationResults]: @rh.traceable(name=eval_name) def _wrapper_super_inner( runs_: str, examples_: str, evaluation_results_: str ) -> Union[EvaluationResult, EvaluationResults]: - return evaluator(runs, examples, evaluation_results) + return evaluator(list(runs), list(examples), list(evaluation_results)) return _wrapper_super_inner( f"Runs[] (Length={len(runs)})",