[DRAFT] update evaluate to be concurrent #1340
python/langsmith/evaluation/_arunner.py
@@ -1,4 +1,4 @@
 """V2 Evaluation Interface."""
 from __future__ import annotations

@@ -491,15 +491,24 @@
         cache_path = None
 with ls_utils.with_optional_cache(cache_path, ignore_hosts=[client.api_url]):
     if is_async_target:
-            manager = await manager.awith_predictions(
-                cast(ATARGET_T, target), max_concurrency=max_concurrency
-            )
-        if evaluators:
-            manager = await manager.awith_evaluators(
-                evaluators, max_concurrency=max_concurrency
-            )
-        if summary_evaluators:
-            manager = await manager.awith_summary_evaluators(summary_evaluators)
+            if evaluators:
+                # Run predictions and evaluations in a single pipeline
+                manager = await manager.awith_predictions_and_evaluators(
+                    cast(ATARGET_T, target), evaluators, max_concurrency=max_concurrency
+                )
+            else:
+                manager = await manager.awith_predictions(
+                    cast(ATARGET_T, target), max_concurrency=max_concurrency
+                )
+            if summary_evaluators:
+                manager = await manager.awith_summary_evaluators(summary_evaluators)
+        else:
+            if evaluators:
+                manager = await manager.awith_evaluators(
+                    evaluators, max_concurrency=max_concurrency
+                )
+            if summary_evaluators:
+                manager = await manager.awith_summary_evaluators(summary_evaluators)
     results = AsyncExperimentResults(manager)
     if blocking:
         await results.wait()

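To make the branching above concrete: when the target is async and evaluators are supplied, the new path runs prediction and evaluation as one pipeline rather than as two sequential passes. A minimal, self-contained sketch of that difference, using hypothetical `predict`/`evaluate` stand-ins rather than any langsmith API:

```python
# Toy sketch (not langsmith code): evaluate each prediction as it streams out of an
# async generator instead of materializing every prediction first. `predict` and
# `evaluate` are hypothetical stand-ins for the target function and an evaluator.
import asyncio
from typing import AsyncIterator, List


async def predict(x: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for a slow target call
    return x * 2


async def evaluate(pred: int) -> bool:
    await asyncio.sleep(0.1)  # stand-in for a slow evaluator call
    return pred % 2 == 0


async def predictions(inputs: List[int]) -> AsyncIterator[int]:
    for x in inputs:
        yield await predict(x)


async def two_phase(inputs: List[int]) -> List[bool]:
    # Old shape: wait for every prediction, then evaluate them all.
    preds = [p async for p in predictions(inputs)]
    return [await evaluate(p) for p in preds]


async def pipelined(inputs: List[int]) -> List[bool]:
    # New shape: kick off each evaluation as soon as its prediction arrives, so
    # evaluating example N overlaps with predicting example N+1.
    tasks = []
    async for p in predictions(inputs):
        tasks.append(asyncio.create_task(evaluate(p)))
    return list(await asyncio.gather(*tasks))


if __name__ == "__main__":
    print(asyncio.run(pipelined([1, 2, 3])))  # [True, True, True]
```

With three examples at ~0.1 s per call, the two-phase shape costs roughly the sum of both passes, while the pipelined shape overlaps evaluation of earlier examples with prediction of later ones, which is the effect awith_predictions_and_evaluators is after.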
@@ -642,6 +651,56 @@
             upload_results=self._upload_results,
         )

+    async def awith_predictions_and_evaluators(
+        self,
+        target: ATARGET_T,
+        evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]],
+        /,
+        max_concurrency: Optional[int] = None,
+    ) -> _AsyncExperimentManager:
+        """Run predictions and evaluations in a single pipeline.
+
+        This allows evaluators to process results as soon as they're available from
+        the target function, rather than waiting for all predictions to complete first.
+        """
+        evaluators = _resolve_evaluators(evaluators)
+
+        if not hasattr(self, '_evaluator_executor'):
+            self._evaluator_executor = cf.ThreadPoolExecutor(max_workers=4)

Review comment: I don't like doing this, but I couldn't figure out another way to pass an executor that doesn't trigger a …
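For context on the truncated note above: the pattern being questioned is stashing a lazily created ThreadPoolExecutor on the manager instance. A generic sketch of that pattern with an explicit shutdown hook, purely as an illustration and not something this PR adds:

```python
# Generic illustration (not part of this PR): a lazily created, shared
# ThreadPoolExecutor with an explicit shutdown hook, so repeated use reuses one
# pool and worker threads do not linger after the owner is done with it.
import concurrent.futures as cf
from typing import Optional


class LazyExecutorOwner:
    def __init__(self) -> None:
        self._executor: Optional[cf.ThreadPoolExecutor] = None

    @property
    def executor(self) -> cf.ThreadPoolExecutor:
        # Same effect as the hasattr() check in the diff: create on first use.
        if self._executor is None:
            self._executor = cf.ThreadPoolExecutor(max_workers=4)
        return self._executor

    def close(self) -> None:
        # Explicit teardown; without it the pool lives until interpreter exit.
        if self._executor is not None:
            self._executor.shutdown(wait=False)
            self._executor = None


if __name__ == "__main__":
    owner = LazyExecutorOwner()
    print(owner.executor.submit(sum, [1, 2, 3]).result())  # 6
    owner.close()
```

Wherever the executor ends up living, giving it an explicit shutdown path avoids leaking worker threads across experiments.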
+
+        async def process_examples():
+            async for pred in self._apredict(
+                target,
+                max_concurrency=max_concurrency,
+                include_attachments=_include_attachments(target),
+            ):
+                example, run = pred["example"], pred["run"]
+                result = self._arun_evaluators(
+                    evaluators,
+                    {"run": run, "example": example, "evaluation_results": {"results": []}},
+                    executor=self._evaluator_executor,
+                )
+                yield result
+
+        experiment_results = aitertools.aiter_with_concurrency(
+            max_concurrency,
+            process_examples(),
+            _eager_consumption_timeout=0.001,
+        )
+
+        r1, r2, r3 = aitertools.atee(experiment_results, 3, lock=asyncio.Lock())
+
+        return _AsyncExperimentManager(
+            (result["example"] async for result in r1),
+            experiment=self._experiment,
+            metadata=self._metadata,
+            client=self.client,
+            runs=(result["run"] async for result in r2),
+            evaluation_results=(result["evaluation_results"] async for result in r3),
+            summary_results=self._summary_results,
+            include_attachments=self._include_attachments,
+            upload_results=self._upload_results,
+        )
+
     async def awith_predictions(
         self,
         target: ATARGET_T,

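The new method tees the pipelined results three ways (r1, r2, r3) so examples, runs, and evaluation results can each be consumed as their own async stream while the upstream pipeline is only consumed once. A toy sketch of that fan-out idea (the real aitertools.atee used in the diff may be implemented quite differently):

```python
# Toy sketch of async-iterator fan-out: one upstream iterator is drained exactly
# once by a pump task, and every item is copied into one queue per consumer.
import asyncio
from typing import Any, AsyncIterator, List

_SENTINEL = object()


def fanout(source: AsyncIterator[Any], n: int) -> List[AsyncIterator[Any]]:
    queues: List[asyncio.Queue] = [asyncio.Queue() for _ in range(n)]

    async def pump() -> None:
        async for item in source:
            for q in queues:
                await q.put(item)
        for q in queues:
            await q.put(_SENTINEL)  # tell each consumer the stream is done

    pump_task = asyncio.create_task(pump())

    async def reader(q: asyncio.Queue) -> AsyncIterator[Any]:
        while True:
            item = await q.get()
            if item is _SENTINEL:
                break
            yield item
        await pump_task  # surface any error raised while draining the source

    return [reader(q) for q in queues]


async def main() -> None:
    async def rows() -> AsyncIterator[dict]:
        for i in range(3):
            yield {"example": f"ex-{i}", "run": f"run-{i}"}

    examples, runs = fanout(rows(), 2)
    print([r["example"] async for r in examples])  # ['ex-0', 'ex-1', 'ex-2']
    print([r["run"] async for r in runs])          # ['run-0', 'run-1', 'run-2']


if __name__ == "__main__":
    asyncio.run(main())
```

The trade-off of any tee is buffering: if one of the three streams is consumed much faster than the others, the slower queues hold the backlog in memory.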
@@ -796,15 +855,17 @@
             run = current_results["run"]
             example = current_results["example"]
             eval_results = current_results["evaluation_results"]
-            for evaluator in evaluators:
+            lock = asyncio.Lock()

Review comment: I don't think it's necessary, can remove if wanted.

Review comment: if not necessary remove -- is this used only for .extend()? (I believe .extend is atomic) https://docs.python.org/3/faq/library.html#what-kinds-of-global-value-mutation-are-thread-safe
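On the lock question above, a small illustration of when an asyncio.Lock actually matters (my own sketch, not part of the PR): coroutines on one event loop only interleave at await points, so a bare list.extend() cannot be torn; a lock only helps when a read-modify-write spans an await, or when another thread mutates the same object.

```python
# Demonstrates why plain extend() is safe under asyncio.gather, while a
# read-modify-write that suspends in the middle loses updates without a lock.
import asyncio

results = []
counter = 0
lock = asyncio.Lock()


async def safe_extend(i: int) -> None:
    # No await between reading and mutating `results`, so no lock is needed here.
    results.extend([i])


async def racy_increment() -> None:
    global counter
    current = counter
    await asyncio.sleep(0)      # suspension point: other coroutines run now
    counter = current + 1       # lost updates: everyone wrote 0 + 1


async def locked_increment() -> None:
    global counter
    async with lock:
        current = counter
        await asyncio.sleep(0)  # still suspends, but the lock serializes the section
        counter = current + 1


async def main() -> None:
    global counter
    await asyncio.gather(*(safe_extend(i) for i in range(100)))
    await asyncio.gather(*(racy_increment() for _ in range(100)))
    print(len(results), counter)  # 100 1  <- extend is fine, the increment raced
    counter = 0
    await asyncio.gather(*(locked_increment() for _ in range(100)))
    print(counter)                # 100    <- lock makes the spanning section atomic
```

If all mutation of eval_results stays on the event loop, the lock is redundant, as the review comments suggest; it only earns its keep if some path touches the list from another thread.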
+
+            async def _run_single_evaluator(evaluator):
                 try:
                     evaluator_response = await evaluator.aevaluate_run(
                         run=run,
                         example=example,
                     )
-                    eval_results["results"].extend(
-                        self.client._select_eval_results(evaluator_response)
-                    )
+                    selected_results = self.client._select_eval_results(evaluator_response)
+                    async with lock:
+                        eval_results["results"].extend(selected_results)

                     if self._upload_results:
                         self.client._log_evaluation_feedback(
                             evaluator_response, run=run, _executor=executor

@@ -824,9 +885,9 @@
                             for key in feedback_keys
                         ]
                     )
-                    eval_results["results"].extend(
-                        self.client._select_eval_results(error_response)
-                    )
+                    selected_results = self.client._select_eval_results(error_response)
+                    async with lock:
+                        eval_results["results"].extend(selected_results)
                     if self._upload_results:
                         self.client._log_evaluation_feedback(
                             error_response, run=run, _executor=executor

@@ -839,15 +900,8 @@ | |
f" run {run.id}: {repr(e)}", | ||
exc_info=True, | ||
) | ||
logger.error( | ||
f"Error running evaluator {repr(evaluator)} on" | ||
f" run {run.id}: {repr(e)}", | ||
exc_info=True, | ||
) | ||
if example.attachments is not None: | ||
for attachment in example.attachments: | ||
reader = example.attachments[attachment]["reader"] | ||
reader.seek(0) | ||

Review comment on lines -847 to -850: This no longer works because the evaluators run in parallel. I do not know the ideal solution to this, am open to any and all ideas.
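One possible direction for the attachment problem flagged above, offered only as an idea and not as the PR's solution: snapshot each attachment's bytes once per example and hand every evaluator its own BytesIO, so concurrent evaluators never share a reader cursor. The helper names below are hypothetical, and the sketch only handles the "reader" field shown in the removed code:

```python
# Idea sketch: per-evaluator attachment copies instead of rewinding a shared reader.
import io
from typing import Dict


def snapshot_attachments(attachments: Dict[str, dict]) -> Dict[str, bytes]:
    """Read every attachment's reader once and keep the raw bytes."""
    return {name: att["reader"].read() for name, att in attachments.items()}


def fresh_attachments(snapshot: Dict[str, bytes]) -> Dict[str, dict]:
    """Build a per-evaluator copy where each reader starts at position 0."""
    return {name: {"reader": io.BytesIO(data)} for name, data in snapshot.items()}


# Usage sketch: take the snapshot once per example, then hand each concurrently
# running evaluator its own fresh_attachments(snapshot) instead of the shared readers.
```

The obvious cost is holding each attachment fully in memory for the duration of that example's evaluations.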
+
+            await asyncio.gather(*[_run_single_evaluator(evaluator) for evaluator in evaluators])
             return ExperimentResultRow(
                 run=run,
                 example=example,

Review comment: summary evaluators are still evaluated after all the predictions and evaluations have been made; this can change in the future, but I think it is much less of a bottleneck.