Compute test metrics (#534)
Usage:

```
from langsmith.beta import compute_test_metrics

def my_evaluator(run, example):
    score = "foo" in run.outputs['output']
    return {"key": "is_foo", "score": score}

compute_test_metrics("fun-mirror-32", evaluators=[my_evaluator])
```
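For reference, a sketch of a call that also sets the optional parameters explicitly (`has_output` is a made-up evaluator; the project name is the illustrative one from above):

```
from langsmith import Client
from langsmith.beta import compute_test_metrics

def has_output(run, example):
    # Hypothetical evaluator: score 1 if the run produced any output, else 0.
    return {"key": "has_output", "score": int(bool(run.outputs))}

compute_test_metrics(
    "fun-mirror-32",
    evaluators=[has_output],
    max_concurrency=5,  # cap the number of parallel evaluation calls
    client=Client(),    # optional; a default Client() is constructed if omitted
)
```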
hinthornw authored Mar 19, 2024
1 parent 7f883bb commit 8d01ad4
Showing 2 changed files with 71 additions and 3 deletions.
4 changes: 2 additions & 2 deletions js/src/tests/client.int.test.ts
@@ -28,7 +28,7 @@ async function waitUntilRunFound(
return false;
}
},
180_000,
210_000,
5_000
);
}
@@ -292,7 +292,7 @@ test.concurrent(
expect(run2.outputs).toBeDefined();
expect(Object.keys(run2.outputs ?? {})).toHaveLength(0);
},
180_000
240_000
);

test.concurrent(
70 changes: 69 additions & 1 deletion python/langsmith/beta/_evals.py
@@ -3,12 +3,16 @@
These functions may change in the future.
"""

import collections
import concurrent.futures
import datetime
import itertools
import uuid
from typing import List, Optional, Sequence
from typing import DefaultDict, List, Optional, Sequence, Tuple, TypeVar

import langsmith.beta._utils as beta_utils
import langsmith.schemas as ls_schemas
from langsmith import evaluation as ls_eval
from langsmith.client import Client


@@ -165,3 +169,67 @@ def convert_runs_to_test(
        project.id, end_time=datetime.datetime.now(tz=datetime.timezone.utc)
    )
    return project


def _load_nested_traces(project_name: str, client: Client) -> List[ls_schemas.Run]:
    runs = client.list_runs(project_name=project_name)
    treemap: DefaultDict[uuid.UUID, List[ls_schemas.Run]] = collections.defaultdict(
        list
    )
    results = []
    all_runs = {}
    for run in runs:
        if run.parent_run_id is not None:
            treemap[run.parent_run_id].append(run)
        else:
            results.append(run)
        all_runs[run.id] = run
    for run_id, child_runs in treemap.items():
        all_runs[run_id].child_runs = sorted(child_runs, key=lambda r: r.dotted_order)
    return results


T = TypeVar("T")
U = TypeVar("U")


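# Pairs every root trace with every evaluator, e.g.
# _outer_product([r1, r2], [e1]) -> [(r1, e1), (r2, e1)].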
def _outer_product(list1: List[T], list2: List[U]) -> List[Tuple[T, U]]:
    return list(itertools.product(list1, list2))


def compute_test_metrics(
    project_name: str,
    *,
    evaluators: list,
    max_concurrency: Optional[int] = 10,
    client: Optional[Client] = None,
) -> None:
"""Compute test metrics for a given test name using a list of evaluators.
Args:
project_name (str): The name of the test project to evaluate.
evaluators (list): A list of evaluators to compute metrics with.
max_concurrency (Optional[int], optional): The maximum number of concurrent
evaluations. Defaults to 10.
client (Optional[Client], optional): The client to use for evaluations.
Defaults to None.
Returns:
None: This function does not return any value.
"""
    evaluators_: List[ls_eval.RunEvaluator] = []
    for func in evaluators:
        if isinstance(func, ls_eval.RunEvaluator):
            evaluators_.append(func)
        elif callable(func):
            evaluators_.append(ls_eval.run_evaluator(func))
        else:
            raise NotImplementedError(
                f"Evaluation not yet implemented for evaluator of type {type(func)}"
            )
    client = client or Client()
    traces = _load_nested_traces(project_name, client)

    # Evaluate
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrency) as executor:
        executor.map(
            client.evaluate_run, *zip(*_outer_product(traces, evaluators_))
        )
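For intuition, the fan-out at the end of `compute_test_metrics` takes the cross product of root traces and evaluators, then `*zip(*pairs)` splits the pairs into two parallel sequences so `executor.map` calls `client.evaluate_run(run, evaluator)` once per pair. A standalone sketch of the same pattern with plain stand-in values:

```
import concurrent.futures
import itertools

traces = ["run_a", "run_b"]          # stand-ins for root Run objects
evaluators = ["eval_1", "eval_2"]    # stand-ins for RunEvaluator instances

pairs = list(itertools.product(traces, evaluators))
# pairs == [("run_a", "eval_1"), ("run_a", "eval_2"),
#           ("run_b", "eval_1"), ("run_b", "eval_2")]

def evaluate(run, evaluator):
    return f"{evaluator} scored {run}"

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # zip(*pairs) transposes the pairs into (all runs, all evaluators);
    # the outer * hands them to map as two parallel argument iterables.
    results = list(executor.map(evaluate, *zip(*pairs)))

print(results)
# ['eval_1 scored run_a', 'eval_2 scored run_a',
#  'eval_1 scored run_b', 'eval_2 scored run_b']
```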
