[Python:fix] In aevaluate, ensure thread to submit feedback remains open (#995)
hinthornw authored Sep 11, 2024
1 parent 4561fbf commit 842e12a
Showing 4 changed files with 71 additions and 13 deletions.
9 changes: 9 additions & 0 deletions python/langsmith/__init__.py
@@ -72,10 +72,19 @@ def __getattr__(name: str) -> Any:
from langsmith.evaluation import evaluate

return evaluate

elif name == "evaluate_existing":
from langsmith.evaluation import evaluate_existing

return evaluate_existing
elif name == "aevaluate":
from langsmith.evaluation import aevaluate

return aevaluate
elif name == "aevaluate_existing":
from langsmith.evaluation import aevaluate_existing

return aevaluate_existing
elif name == "tracing_context":
from langsmith.run_helpers import tracing_context

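For context, the `__init__.py` hunk above extends the module-level `__getattr__` hook so that `aevaluate` and `aevaluate_existing` can be imported straight from the top-level `langsmith` package, with the heavy `langsmith.evaluation` import deferred until first access. A minimal runnable sketch of that PEP 562 pattern, using stdlib stand-ins for the lazily imported objects (this sketch is not part of the commit):

# lazy_mod.py -- illustrative sketch, not part of the commit: the PEP 562
# module-level __getattr__ pattern that the __init__.py hunk extends.
from typing import Any


def __getattr__(name: str) -> Any:
    # Invoked only when `name` is not already a real attribute of this module,
    # so the import cost is paid on first access instead of at `import lazy_mod` time.
    if name == "evaluate":
        from json import dumps as evaluate  # stdlib stand-in for langsmith.evaluation.evaluate

        return evaluate
    elif name == "aevaluate":
        from json import loads as aevaluate  # stdlib stand-in for langsmith.evaluation.aevaluate

        return aevaluate
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


# Usage from another module: `import lazy_mod; lazy_mod.aevaluate` triggers
# __getattr__ once; subsequent callers can cache the resolved object themselves.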
20 changes: 10 additions & 10 deletions python/langsmith/evaluation/_arunner.py
@@ -102,8 +102,7 @@ async def aevaluate(
Examples:
>>> from typing import Sequence
>>> from langsmith import Client
>>> from langsmith.evaluation import evaluate
>>> from langsmith import Client, aevaluate
>>> from langsmith.schemas import Example, Run
>>> client = Client()
>>> dataset = client.clone_public_dataset(
@@ -206,7 +205,7 @@ async def aevaluate(
>>> async def helpfulness(run: Run, example: Example):
... # Row-level evaluator for helpfulness.
... await asyncio.sleep(0.1) # Replace with your LLM API call
... await asyncio.sleep(5) # Replace with your LLM API call
... return {"score": run.outputs["output"] == "Yes"}
>>> results = asyncio.run(
@@ -286,7 +285,7 @@ async def aevaluate_existing(
Load the experiment and run the evaluation.
>>> from langsmith.evaluation import aevaluate, aevaluate_existing
>>> from langsmith import aevaluate, aevaluate_existing
>>> dataset_name = "Evaluate Examples"
>>> async def apredict(inputs: dict) -> dict:
... # This can be any async function or just an API call to your app.
@@ -603,18 +602,19 @@ async def _ascore(
evaluators: Sequence[RunEvaluator],
max_concurrency: Optional[int] = None,
) -> AsyncIterator[ExperimentResultRow]:
async def score_all():
with cf.ThreadPoolExecutor(max_workers=4) as executor:
with cf.ThreadPoolExecutor(max_workers=4) as executor:

async def score_all():
async for current_results in self.aget_results():
# Yield the coroutine to be awaited later in aiter_with_concurrency
yield self._arun_evaluators(
evaluators, current_results, executor=executor
)

async for result in aitertools.aiter_with_concurrency(
max_concurrency, score_all(), _eager_consumption_timeout=0.001
):
yield result
async for result in aitertools.aiter_with_concurrency(
max_concurrency, score_all(), _eager_consumption_timeout=0.001
):
yield result

async def _arun_evaluators(
self,
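The `_ascore` hunk above is the heart of the fix. In the old code the `with cf.ThreadPoolExecutor(...)` block lived inside `score_all()`, so the pool could be shut down as soon as the async generator was exhausted, before `aiter_with_concurrency` had finished awaiting the evaluator coroutines that submit feedback on that executor. The new code hoists the `with` so it also encloses the consumption loop, keeping the feedback thread pool open until every result has been yielded. A minimal runnable sketch of that shape (names like `evaluate_item` and `submit_feedback` are illustrative stand-ins, not LangSmith APIs):

# executor_lifetime.py -- illustrative sketch, not part of the commit: why the
# `with` must wrap consumption of the coroutines, not just their production.
import asyncio
import concurrent.futures as cf
import time
from typing import AsyncIterator, Awaitable


def submit_feedback(item: int) -> None:
    # Stand-in for the blocking feedback upload that runs on the thread pool.
    time.sleep(0.2)
    print(f"feedback submitted for item {item}")


async def evaluate_item(item: int, executor: cf.ThreadPoolExecutor) -> int:
    # Stand-in for a row-level evaluator: schedules blocking work on the shared executor.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(executor, submit_feedback, item)
    return item


async def score_all() -> None:
    # The fix's shape: the executor context manager wraps both the generator that
    # creates evaluation coroutines and the loop that awaits them, so the pool is
    # still open when each coroutine submits its feedback.
    with cf.ThreadPoolExecutor(max_workers=4) as executor:

        async def produce() -> AsyncIterator[Awaitable[int]]:
            for item in range(5):
                yield evaluate_item(item, executor)

        async for coro in produce():
            await coro
    # The `with` exits only after every coroutine has been awaited; shutdown(wait=True)
    # then joins any remaining worker threads.


asyncio.run(score_all())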
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.117"
version = "0.1.18"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <[email protected]>"]
license = "MIT"
53 changes: 51 additions & 2 deletions python/tests/evaluation/test_evaluation.py
@@ -1,11 +1,36 @@
import asyncio
from typing import Sequence
import time
from typing import Callable, Sequence, Tuple, TypeVar

import pytest

from langsmith import Client, aevaluate, evaluate, expect, test
from langsmith.schemas import Example, Run

T = TypeVar("T")


def wait_for(
condition: Callable[[], Tuple[T, bool]],
max_sleep_time: int = 120,
sleep_time: int = 3,
) -> T:
"""Wait for a condition to be true."""
start_time = time.time()
last_e = None
while time.time() - start_time < max_sleep_time:
try:
res, cond = condition()
if cond:
return res
except Exception as e:
last_e = e
time.sleep(sleep_time)
total_time = time.time() - start_time
if last_e is not None:
raise last_e
raise ValueError(f"Callable did not return within {total_time}")


def test_evaluate():
client = Client()
@@ -59,6 +84,12 @@ def accuracy(run: Run, example: Example):
expected = example.outputs["answer"] # type: ignore
return {"score": expected.lower() == pred.lower()}

async def slow_accuracy(run: Run, example: Example):
pred = run.outputs["output"] # type: ignore
expected = example.outputs["answer"] # type: ignore
await asyncio.sleep(5)
return {"score": expected.lower() == pred.lower()}

def precision(runs: Sequence[Run], examples: Sequence[Example]):
predictions = [run.outputs["output"].lower() for run in runs] # type: ignore
expected = [example.outputs["answer"].lower() for example in examples] # type: ignore
@@ -73,7 +104,7 @@ async def apredict(inputs: dict) -> dict:
results = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
evaluators=[accuracy, slow_accuracy],
summary_evaluators=[precision],
experiment_prefix="My Experiment",
description="My Experiment Description",
@@ -86,12 +117,30 @@ async def apredict(inputs: dict) -> dict:
assert len(results) == 20
examples = client.list_examples(dataset_name=dataset_name)
all_results = [r async for r in results]
all_examples = []
for example in examples:
count = 0
for r in all_results:
if r["run"].reference_example_id == example.id:
count += 1
assert count == 2
all_examples.append(example)

# Wait for there to be 2x runs vs. examples
def check_run_count():
current_runs = list(
client.list_runs(project_name=results.experiment_name, is_root=True)
)
for r in current_runs:
assert "accuracy" in r.feedback_stats
assert "slow_accuracy" in r.feedback_stats
return current_runs, len(current_runs) == 2 * len(all_examples)

final_runs = wait_for(check_run_count, max_sleep_time=60, sleep_time=2)

assert len(final_runs) == 2 * len(
all_examples
), f"Expected {2 * len(all_examples)} runs, but got {len(final_runs)}"


@test
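Since feedback is now submitted from a thread pool that can finish after `aevaluate` itself returns, the test wraps its run and feedback-count assertions in the new `wait_for` polling helper rather than asserting immediately. A small usage sketch of that helper's contract, with a toy background thread standing in for the real feedback submission (it assumes the `wait_for` defined in the test file above is in scope; nothing here is part of the commit):

# wait_for_sketch.py -- illustrative sketch, not part of the commit.
import threading
import time
from typing import List

results: List[int] = []


def background_worker() -> None:
    # Simulates slow, thread-pooled feedback submission finishing after the caller moves on.
    for i in range(3):
        time.sleep(0.5)
        results.append(i)


threading.Thread(target=background_worker, daemon=True).start()

# The condition callable returns a (value, done) tuple; wait_for re-polls every
# sleep_time seconds until done is True or max_sleep_time elapses, re-raising the
# last exception if the condition kept failing.
items = wait_for(lambda: (list(results), len(results) == 3), max_sleep_time=10, sleep_time=1)
assert items == [0, 1, 2]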
