Commit

fmt
baskaryan committed Dec 24, 2024
1 parent 8e8c31f commit 197c037
Showing 2 changed files with 27 additions and 78 deletions.
60 changes: 0 additions & 60 deletions python/tests/integration_tests/test_client.py
@@ -24,7 +24,6 @@
from langsmith.schemas import (
AttachmentsOperations,
DataType,
EvaluationResult,
Example,
ExampleUpdateWithAttachments,
ExampleUploadWithAttachments,
@@ -1255,65 +1254,6 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None:
langchain_client.delete_dataset(dataset_id=dataset.id)


async def test_summary_evaluation_with_evaluator_results(
langchain_client: Client,
) -> None:
"""Test summary evaluators receive evaluator results."""
dataset_name = "__test_summary_evaluation_inline_eval" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example_id = uuid4()
langchain_client.create_example(
dataset_id=dataset.id,
inputs={"question": "What is 2+2?"},
outputs={"answer": "4"},
example_id=example_id,
)

def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
return {"answer": "4"}

async def target_async(inputs: Dict[str, Any]) -> Dict[str, Any]:
return {"answer": "4"}

def evaluator(outputs: dict, reference_outputs: dict) -> dict:
return {"score": 1, "key": "foo"}

def summary_evaluator(evaluation_results: list[EvaluationResult]) -> bool:
assert len(evaluation_results) == 1
assert evaluation_results[0][0].key == "foo"
assert evaluation_results[0][0].score == 1
return True

results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
summary_evaluators=[summary_evaluator],
num_repetitions=1,
)
assert len(results._summary_results["results"]) == 1
assert results._summary_results["results"][0].score == 1
assert results._summary_results["results"][0].key == "summary_evaluator"

results = await langchain_client.aevaluate(
target_async,
data=dataset_name,
evaluators=[evaluator],
summary_evaluators=[summary_evaluator],
num_repetitions=1,
)
assert len(results._summary_results["results"]) == 1
assert results._summary_results["results"][0].score == 1
assert results._summary_results["results"][0].key == "summary_evaluator"

langchain_client.delete_dataset(dataset_id=dataset.id)


def test_evaluate_with_attachments_multiple_evaluators(
langchain_client: Client,
) -> None:
45 changes: 27 additions & 18 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -265,15 +265,6 @@ def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

def summary_eval_runs_examples(runs_, examples_):
return {"score": len(runs_[0].dotted_order)}

def summary_eval_inputs_outputs(inputs, outputs):
return [{"score": len([x["in"] for x in inputs])}]

def summary_eval_outputs_reference(outputs, reference_outputs):
return len([x["answer"] for x in reference_outputs])

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
@@ -285,10 +276,23 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
eval_list,
]

def summary_eval_runs_examples(runs_, examples_):
return {"score": len(runs_[0].dotted_order)}

def summary_eval_inputs_outputs(inputs, outputs):
return [{"score": len([x["in"] for x in inputs])}]

def summary_eval_outputs_reference(outputs, reference_outputs):
return len([x["answer"] for x in reference_outputs])

def summary_eval_evaluation_results(evaluation_results):
return all(len(r) == len(evaluators) + 1 for r in evaluation_results)

summary_evaluators = [
summary_eval_runs_examples,
summary_eval_inputs_outputs,
summary_eval_outputs_reference,
summary_eval_evaluation_results,
]

results = evaluate(
@@ -302,6 +306,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
upload_results=upload_results,
max_concurrency=None,
)

if not blocking:
deltas = []
last = None
@@ -557,15 +562,6 @@ async def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

def summary_eval_runs_examples(runs_, examples_):
return {"score": len(runs_[0].dotted_order)}

def summary_eval_inputs_outputs(inputs, outputs):
return {"score": len([x["in"] for x in inputs])}

def summary_eval_outputs_reference(outputs, reference_outputs):
return {"score": len([x["answer"] for x in reference_outputs])}

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
@@ -577,10 +573,23 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
eval_list,
]

def summary_eval_runs_examples(runs_, examples_):
return {"score": len(runs_[0].dotted_order)}

def summary_eval_inputs_outputs(inputs, outputs):
return {"score": len([x["in"] for x in inputs])}

def summary_eval_outputs_reference(outputs, reference_outputs):
return {"score": len([x["answer"] for x in reference_outputs])}

def summary_eval_evaluation_results(evaluation_results):
return all(len(r) == len(evaluators) + 1 for r in evaluation_results)

summary_evaluators = [
summary_eval_runs_examples,
summary_eval_inputs_outputs,
summary_eval_outputs_reference,
summary_eval_evaluation_results,
]

results = await aevaluate(
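For reference, below is a minimal, hypothetical sketch (not part of this commit) of the pattern the updated unit test exercises: a summary evaluator whose single parameter is named evaluation_results, so it receives each example's evaluator results rather than the raw runs and examples. The dataset name, target, and evaluator here are illustrative placeholders, not code from this repository.

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

def target(inputs: dict) -> dict:
    # Hypothetical system under test.
    return {"answer": "4"}

def correctness(outputs: dict, reference_outputs: dict) -> dict:
    # Per-example evaluator: produces one result per example.
    return {"key": "correctness", "score": int(outputs == reference_outputs)}

def all_examples_scored(evaluation_results) -> dict:
    # Summary evaluator: `evaluation_results` holds, for each example, the
    # results produced by the per-example evaluators above (here, one each).
    return {"score": int(all(len(r) == 1 for r in evaluation_results))}

results = evaluate(
    target,
    data="my-dataset",  # placeholder dataset name
    evaluators=[correctness],
    summary_evaluators=[all_examples_scored],
    client=client,
)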
