Fix issues in ResponseMatching operator #589

Closed
@@ -6,7 +6,7 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
 
 from uptrain import CritiqueTone, Evals, GuidelineAdherence, ResponseMatching  # type: ignore
-from uptrain.framework.evals import ParametricEval
+from uptrain.framework.evalllm import ParametricEval
 
 
 class UpTrainMetric(Enum):
@@ -57,7 +57,7 @@ class UpTrainMetric(Enum):
     GUIDELINE_ADHERENCE = "guideline_adherence"
 
     #: Response matching.\
-    #: Inputs - `responses: List[str], ground_truths: List[str]`\
+    #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\
     #: Parameters - `method: str`
     RESPONSE_MATCHING = "response_matching"
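For reference, a minimal usage sketch of the updated metric, assuming the integration's public API as exercised by the tests below (`UpTrainEvaluator`, `UpTrainMetric`, Haystack's `Secret`; the import paths are assumed and the sample strings are illustrative):

from haystack.utils import Secret
from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

# Response matching now takes the questions alongside the responses and ground truths.
evaluator = UpTrainEvaluator(
    metric=UpTrainMetric.RESPONSE_MATCHING,
    metric_params={"method": "llm"},
    api="openai",
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
)
output = evaluator.run(
    questions=["Which is the most popular global sport?"],
    responses=["Football is the most popular sport."],
    ground_truths=["Football is the most popular sport in the world."],
)
# output["results"] holds one list of results per input triple, with entries such as
# "response_match_precision", "response_match_recall" and "response_match".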

@@ -235,13 +235,14 @@ def response(
             yield {"response": r}
 
     @staticmethod
-    def response_ground_truth(
+    def question_response_ground_truth(
+        questions: List[str],
         responses: List[str],
         ground_truths: List[str],
     ) -> Iterable[Dict[str, str]]:
-        InputConverters._validate_input_elements(ground_truths=ground_truths, responses=responses)
-        for r, gt in zip(responses, ground_truths):  # type: ignore
-            yield {"response": r, "ground_truth": gt}
+        InputConverters._validate_input_elements(questions=questions, ground_truths=ground_truths, responses=responses)
+        for q, r, gt in zip(questions, responses, ground_truths):  # type: ignore
+            yield {"question": q, "response": r, "ground_truth": gt}
 
 
 class OutputConverters:
@@ -267,12 +268,13 @@ def validate_outputs(outputs: List[Dict[str, Any]]):
                 (
                     float,
                     str,
+                    int,
                 ),
             )
             for x in outputs
             for y in x.values()
         ):
-            msg = "UpTrain evaluator expects values in the output dicts to be either `str` or `float`"
+            msg = "UpTrain evaluator expects values in the output dicts to be either `str`, `float` or `int`"
 
         if msg is not None:
             raise ValueError(msg)
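A quick, self-contained sketch of what the relaxed check now permits (the output row below is hypothetical; integer-valued scores are presumably what motivated this change, and the real validation lives in `OutputConverters.validate_outputs`):

# An output dict containing an int score now passes the type check.
outputs = [{"response_match_recall": 1, "explanation_response_match": "exact match"}]
assert all(isinstance(y, (float, str, int)) for x in outputs for y in x.values())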
@@ -311,7 +313,7 @@ def critique_language(
     def critique_tone(
         output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]  # noqa: ARG004
     ) -> List[MetricResult]:
-        return [OutputConverters._extract_default_results(output, "tone")]
+        return [OutputConverters._extract_default_results(output, "critique_tone")]
 
     @staticmethod
     def guideline_adherence(output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]:
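To make the renaming concrete: the mocked test data below suggests the converter now picks up the `score_critique_tone` / `explanation_critique_tone` keys from UpTrain's output and emits a result named `critique_tone`. A hypothetical shape, with illustrative values:

# Raw UpTrain output row for CRITIQUE_TONE (illustrative values only).
uptrain_row = {"score_critique_tone": 0.4, "explanation_critique_tone": "The tone is overly formal."}

# Corresponding result emitted by the evaluator (name/score/explanation keys, as asserted in the tests).
metric_result = {"name": "critique_tone", "score": 0.4, "explanation": "The tone is overly formal."}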
@@ -380,7 +382,7 @@ def response_matching(
     UpTrainMetric.RESPONSE_MATCHING: MetricDescriptor.new(
         UpTrainMetric.RESPONSE_MATCHING,
         ResponseMatching,
-        InputConverters.response_ground_truth,  # type: ignore
+        InputConverters.question_response_ground_truth,  # type: ignore
         OutputConverters.response_matching,
         init_parameters={"method": Optional[str]},  # type: ignore
     ),
integrations/uptrain/tests/test_evaluator.py (166 changes: 88 additions & 78 deletions)
@@ -82,8 +82,8 @@ def log_and_evaluate(self, data, checks, **kwargs):
                 "explanation_politeness": "11",
             },
             UpTrainMetric.CRITIQUE_TONE: {
-                "score_tone": 0.4,
-                "explanation_tone": "12",
+                "score_critique_tone": 0.4,
+                "explanation_critique_tone": "12",
             },
             UpTrainMetric.GUIDELINE_ADHERENCE: {
                 "score_guideline_adherence": 1.0,
@@ -103,13 +103,76 @@ def log_and_evaluate(self, data, checks, **kwargs):
         return data
 
 
+# This integration test validates the evaluator by running it against the
+# OpenAI API. It is parameterized by the metric, the inputs to the evaluator
+# and the metric parameters.
+@pytest.mark.integration
+@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
+@pytest.mark.parametrize(
+    "metric, inputs, metric_params",
+    [
+        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
+        (
+            UpTrainMetric.FACTUAL_ACCURACY,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (
+            UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (
+            UpTrainMetric.RESPONSE_CONSISTENCY,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}),
+        (
+            UpTrainMetric.GUIDELINE_ADHERENCE,
+            {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES},
+            {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
+        ),
+        (
+            UpTrainMetric.RESPONSE_MATCHING,
+            {
+                "questions": DEFAULT_QUESTIONS,
+                "ground_truths": [
+                    "Consumerism is the most popular sport in the world",
+                    "Python language was created by some dude.",
+                ],
+                "responses": DEFAULT_RESPONSES,
+            },
+            {"method": "llm"},
+        ),
+    ],
+)
+def test_integration_run(metric, inputs, metric_params):
+    init_params = {
+        "metric": metric,
+        "metric_params": metric_params,
+        "api": "openai",
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
+    }
+    eval = UpTrainEvaluator(**init_params)
+    output = eval.run(**inputs)
+
+    assert type(output) == dict
+    assert len(output) == 1
+    assert "results" in output
+    assert len(output["results"]) == len(next(iter(inputs.values())))
+
+
 def test_evaluator_api(monkeypatch):
     monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
     monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key")
 
-    eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS)
-    assert eval.api == "openai"
-    assert eval.api_key == Secret.from_env_var("OPENAI_API_KEY")
+    with pytest.raises(ValueError, match="OpenAI API Key is invalid"):
+        eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS)
 
     eval = UpTrainEvaluator(
         UpTrainMetric.RESPONSE_COMPLETENESS,
@@ -127,7 +190,7 @@ def test_evaluator_api(monkeypatch):
     with pytest.raises(ValueError, match="None of the following authentication environment variables are set"):
         UpTrainEvaluator(UpTrainMetric.CONTEXT_RELEVANCE, api="uptrain", api_key=Secret.from_env_var("asd39920qqq"))
 
-    with pytest.raises(ValueError, match="does not support additional parameters"):
+    with pytest.raises(ValueError, match="OpenAI API Key is invalid"):
         UpTrainEvaluator(
             UpTrainMetric.CONTEXT_RELEVANCE,
             api_params={"project_name": "test"},
@@ -146,22 +209,26 @@ def test_evaluator_metric_init_params():
     eval = UpTrainEvaluator(
         UpTrainMetric.CRITIQUE_TONE,
         metric_params={"llm_persona": "village idiot"},
-        api_key=Secret.from_token("Aaa"),
+        api_key=Secret.from_env_var("OPENAI_API_KEY"),
     )
     assert eval._backend_metric.llm_persona == "village idiot"
 
     with pytest.raises(ValueError, match="Invalid init parameters"):
         UpTrainEvaluator(
-            UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_token("Aaa")
+            UpTrainMetric.CRITIQUE_TONE,
+            metric_params={"role": "village idiot"},
+            api_key=Secret.from_env_var("OPENAI_API_KEY"),
         )
 
     with pytest.raises(ValueError, match="unexpected init parameters"):
         UpTrainEvaluator(
-            UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_token("Aaa")
+            UpTrainMetric.FACTUAL_ACCURACY,
+            metric_params={"check_numbers": True},
+            api_key=Secret.from_env_var("OPENAI_API_KEY"),
         )
 
     with pytest.raises(ValueError, match="expected init parameters"):
-        UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_token("Aaa"))
+        UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_env_var("OPENAI_API_KEY"))
 
 
 @patch("os.environ.get")
@@ -211,14 +278,14 @@ def test_evaluator_serde(os_environ_get):
             {"questions": [], "responses": []},
             {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
         ),
-        (UpTrainMetric.RESPONSE_MATCHING, {"ground_truths": [], "responses": []}, {"method": "llm"}),
+        (UpTrainMetric.RESPONSE_MATCHING, {"questions": [], "ground_truths": [], "responses": []}, {"method": "llm"}),
     ],
 )
 def test_evaluator_valid_inputs(metric, inputs, params):
     init_params = {
         "metric": metric,
         "metric_params": params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
         "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -245,7 +312,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
     init_params = {
         "metric": metric,
         "metric_params": params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
         "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -295,7 +362,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
             ],
             None,
         ),
-        (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("tone", 0.4, "12")]], {"llm_persona": "idiot"}),
+        (
+            UpTrainMetric.CRITIQUE_TONE,
+            {"responses": ["r9"]},
+            [[("critique_tone", 0.4, "12")]],
+            {"llm_persona": "idiot"},
+        ),
         (
             UpTrainMetric.GUIDELINE_ADHERENCE,
             {"questions": ["q10"], "responses": ["r10"]},
@@ -304,7 +376,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
         ),
         (
             UpTrainMetric.RESPONSE_MATCHING,
-            {"ground_truths": ["g11"], "responses": ["r11"]},
+            {"questions": ["q11"], "ground_truths": ["g11"], "responses": ["r11"]},
             [
                 [
                     ("response_match_precision", 1.0, None),
@@ -320,7 +392,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params):
     init_params = {
         "metric": metric,
         "metric_params": metric_params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
        "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -336,65 +408,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params):
         expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o}
         got = {(x["name"], x["score"], x["explanation"]) for x in r}
         assert got == expected
-
-
-# This integration test validates the evaluator by running it against the
-# OpenAI API. It is parameterized by the metric, the inputs to the evalutor
-# and the metric parameters.
-@pytest.mark.integration
-@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
-@pytest.mark.parametrize(
-    "metric, inputs, metric_params",
-    [
-        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
-        (
-            UpTrainMetric.FACTUAL_ACCURACY,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (
-            UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (
-            UpTrainMetric.RESPONSE_CONSISTENCY,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}),
-        (
-            UpTrainMetric.GUIDELINE_ADHERENCE,
-            {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES},
-            {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
-        ),
-        (
-            UpTrainMetric.RESPONSE_MATCHING,
-            {
-                "ground_truths": [
-                    "Consumerism is the most popular sport in the world",
-                    "Python language was created by some dude.",
-                ],
-                "responses": DEFAULT_RESPONSES,
-            },
-            {"method": "llm"},
-        ),
-    ],
-)
-def test_integration_run(metric, inputs, metric_params):
-    init_params = {
-        "metric": metric,
-        "metric_params": metric_params,
-        "api": "openai",
-    }
-    eval = UpTrainEvaluator(**init_params)
-    output = eval.run(**inputs)
-
-    assert type(output) == dict
-    assert len(output) == 1
-    assert "results" in output
-    assert len(output["results"]) == len(next(iter(inputs.values())))