From e5d54b3120dc583f2d0ea1b0b45aa4aa2abf1f6f Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Sat, 16 Mar 2024 16:31:18 +0530 Subject: [PATCH 1/7] Fix issues in ResponseMatching operator --- .../components/evaluators/uptrain/metrics.py | 16 +++++++++------- integrations/uptrain/tests/test_evaluator.py | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py index a13843d4a..9ddb60709 100644 --- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py +++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py @@ -57,7 +57,7 @@ class UpTrainMetric(Enum): GUIDELINE_ADHERENCE = "guideline_adherence" #: Response matching.\ - #: Inputs - `responses: List[str], ground_truths: List[str]`\ + #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\ #: Parameters - `method: str` RESPONSE_MATCHING = "response_matching" @@ -235,13 +235,14 @@ def response( yield {"response": r} @staticmethod - def response_ground_truth( + def question_response_ground_truth( + questions: List[str], responses: List[str], ground_truths: List[str], ) -> Iterable[Dict[str, str]]: - InputConverters._validate_input_elements(ground_truths=ground_truths, responses=responses) - for r, gt in zip(responses, ground_truths): # type: ignore - yield {"response": r, "ground_truth": gt} + InputConverters._validate_input_elements(questions=questions, ground_truths=ground_truths, responses=responses) + for q, r, gt in zip(questions, responses, ground_truths): # type: ignore + yield {"question": q, "response": r, "ground_truth": gt} class OutputConverters: @@ -267,12 +268,13 @@ def validate_outputs(outputs: List[Dict[str, Any]]): ( float, str, + int, ), ) for x in outputs for y in x.values() ): - msg = "UpTrain evaluator expects values in the output dicts to be either `str` or `float`" + msg = "UpTrain evaluator expects values in the output dicts to be either `str`, `float` or `int`" if msg is not None: raise ValueError(msg) @@ -380,7 +382,7 @@ def response_matching( UpTrainMetric.RESPONSE_MATCHING: MetricDescriptor.new( UpTrainMetric.RESPONSE_MATCHING, ResponseMatching, - InputConverters.response_ground_truth, # type: ignore + InputConverters.question_response_ground_truth, # type: ignore OutputConverters.response_matching, init_parameters={"method": Optional[str]}, # type: ignore ), diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index d7566c795..418a006f0 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -211,7 +211,7 @@ def test_evaluator_serde(os_environ_get): {"questions": [], "responses": []}, {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, ), - (UpTrainMetric.RESPONSE_MATCHING, {"ground_truths": [], "responses": []}, {"method": "llm"}), + (UpTrainMetric.RESPONSE_MATCHING, {"questions": [], "ground_truths": [], "responses": []}, {"method": "llm"}), ], ) def test_evaluator_valid_inputs(metric, inputs, params): @@ -304,7 +304,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ), ( UpTrainMetric.RESPONSE_MATCHING, - {"ground_truths": ["g11"], "responses": ["r11"]}, + {"questions": ["q11"], "ground_truths": ["g11"], "responses": ["r11"]}, [ [ ("response_match_precision", 
1.0, None), @@ -375,6 +375,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): ( UpTrainMetric.RESPONSE_MATCHING, { + "questions": DEFAULT_QUESTIONS, "ground_truths": [ "Consumerism is the most popular sport in the world", "Python language was created by some dude.", From ca319e8651f9396e125e412edad06b1b381617bd Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Tue, 19 Mar 2024 12:27:53 +0530 Subject: [PATCH 2/7] Use .from_env_var instead of .from_token --- integrations/uptrain/tests/test_evaluator.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 418a006f0..81e7783aa 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -146,22 +146,22 @@ def test_evaluator_metric_init_params(): eval = UpTrainEvaluator( UpTrainMetric.CRITIQUE_TONE, metric_params={"llm_persona": "village idiot"}, - api_key=Secret.from_token("Aaa"), + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) assert eval._backend_metric.llm_persona == "village idiot" with pytest.raises(ValueError, match="Invalid init parameters"): UpTrainEvaluator( - UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_token("Aaa") + UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_env_var("OPENAI_API_KEY") ) with pytest.raises(ValueError, match="unexpected init parameters"): UpTrainEvaluator( - UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_token("Aaa") + UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_env_var("OPENAI_API_KEY") ) with pytest.raises(ValueError, match="expected init parameters"): - UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_token("Aaa")) + UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_env_var("OPENAI_API_KEY")) @patch("os.environ.get") @@ -218,7 +218,7 @@ def test_evaluator_valid_inputs(metric, inputs, params): init_params = { "metric": metric, "metric_params": params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) @@ -245,7 +245,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): init_params = { "metric": metric, "metric_params": params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) @@ -320,7 +320,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): init_params = { "metric": metric, "metric_params": metric_params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) From 9b7d349fd8b2b542a2c99897df4bcc8438842874 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Tue, 19 Mar 2024 12:39:39 +0530 Subject: [PATCH 3/7] Run black --- integrations/uptrain/tests/test_evaluator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 81e7783aa..e51daf16f 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -152,12 +152,16 @@ def test_evaluator_metric_init_params(): with 
pytest.raises(ValueError, match="Invalid init parameters"): UpTrainEvaluator( - UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_env_var("OPENAI_API_KEY") + UpTrainMetric.CRITIQUE_TONE, + metric_params={"role": "village idiot"}, + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) with pytest.raises(ValueError, match="unexpected init parameters"): UpTrainEvaluator( - UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_env_var("OPENAI_API_KEY") + UpTrainMetric.FACTUAL_ACCURACY, + metric_params={"check_numbers": True}, + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) with pytest.raises(ValueError, match="expected init parameters"): From 22af43269d00f69dbb19ce636a9206b34f3b2aaa Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 22:34:59 +0530 Subject: [PATCH 4/7] Refactor --- .../components/evaluators/uptrain/metrics.py | 4 ++-- integrations/uptrain/tests/test_evaluator.py | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py index 9ddb60709..284f21682 100644 --- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py +++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py @@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union from uptrain import CritiqueTone, Evals, GuidelineAdherence, ResponseMatching # type: ignore -from uptrain.framework.evals import ParametricEval +from uptrain.framework.evalllm import ParametricEval class UpTrainMetric(Enum): @@ -313,7 +313,7 @@ def critique_language( def critique_tone( output: Dict[str, Any], metric_params: Optional[Dict[str, Any]] # noqa: ARG004 ) -> List[MetricResult]: - return [OutputConverters._extract_default_results(output, "tone")] + return [OutputConverters._extract_default_results(output, "critique_tone")] @staticmethod def guideline_adherence(output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]: diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index e51daf16f..e935a69f9 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -82,8 +82,8 @@ def log_and_evaluate(self, data, checks, **kwargs): "explanation_politeness": "11", }, UpTrainMetric.CRITIQUE_TONE: { - "score_tone": 0.4, - "explanation_tone": "12", + "score_critique_tone": 0.4, + "explanation_critique_tone": "12", }, UpTrainMetric.GUIDELINE_ADHERENCE: { "score_guideline_adherence": 1.0, @@ -107,9 +107,8 @@ def test_evaluator_api(monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key") - eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS) - assert eval.api == "openai" - assert eval.api_key == Secret.from_env_var("OPENAI_API_KEY") + with pytest.raises(ValueError, match="OpenAI API Key is invalid"): + eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS) eval = UpTrainEvaluator( UpTrainMetric.RESPONSE_COMPLETENESS, @@ -127,7 +126,7 @@ def test_evaluator_api(monkeypatch): with pytest.raises(ValueError, match="None of the following authentication environment variables are set"): UpTrainEvaluator(UpTrainMetric.CONTEXT_RELEVANCE, api="uptrain", 
api_key=Secret.from_env_var("asd39920qqq")) - with pytest.raises(ValueError, match="does not support additional parameters"): + with pytest.raises(ValueError, match="OpenAI API Key is invalid"): UpTrainEvaluator( UpTrainMetric.CONTEXT_RELEVANCE, api_params={"project_name": "test"}, @@ -299,7 +298,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ], None, ), - (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("tone", 0.4, "12")]], {"llm_persona": "idiot"}), + (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("critique_tone", 0.4, "12")]], {"llm_persona": "idiot"}), ( UpTrainMetric.GUIDELINE_ADHERENCE, {"questions": ["q10"], "responses": ["r10"]}, From 3e3fc46f27acfdbdf0ba4dcb9282b53dc678e364 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 22:54:12 +0530 Subject: [PATCH 5/7] Reorder tests --- integrations/uptrain/tests/test_evaluator.py | 125 ++++++++++--------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index e935a69f9..a6aff793d 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -103,6 +103,69 @@ def log_and_evaluate(self, data, checks, **kwargs): return data +# This integration test validates the evaluator by running it against the +# OpenAI API. It is parameterized by the metric, the inputs to the evalutor +# and the metric parameters. +@pytest.mark.integration +@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set") +@pytest.mark.parametrize( + "metric, inputs, metric_params", + [ + (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None), + ( + UpTrainMetric.FACTUAL_ACCURACY, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + ( + UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + ( + UpTrainMetric.RESPONSE_CONSISTENCY, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}), + ( + UpTrainMetric.GUIDELINE_ADHERENCE, + {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, + {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, + ), + ( + UpTrainMetric.RESPONSE_MATCHING, + { + "questions": DEFAULT_QUESTIONS, + "ground_truths": [ + "Consumerism is the most popular sport in the world", + "Python language was created by some dude.", + ], + "responses": DEFAULT_RESPONSES, + }, + {"method": "llm"}, + ), + ], +) +def test_integration_run(metric, inputs, metric_params): + init_params = { + "metric": metric, + "metric_params": metric_params, + "api": "openai", + } + eval = UpTrainEvaluator(**init_params) + output = eval.run(**inputs) + + assert type(output) == dict + assert len(output) == 1 + assert "results" in output + assert 
len(output["results"]) == len(next(iter(inputs.values()))) + + def test_evaluator_api(monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key") @@ -340,65 +403,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): got = {(x["name"], x["score"], x["explanation"]) for x in r} assert got == expected - -# This integration test validates the evaluator by running it against the -# OpenAI API. It is parameterized by the metric, the inputs to the evalutor -# and the metric parameters. -@pytest.mark.integration -@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set") -@pytest.mark.parametrize( - "metric, inputs, metric_params", - [ - (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None), - ( - UpTrainMetric.FACTUAL_ACCURACY, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - ( - UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - ( - UpTrainMetric.RESPONSE_CONSISTENCY, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}), - ( - UpTrainMetric.GUIDELINE_ADHERENCE, - {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, - {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, - ), - ( - UpTrainMetric.RESPONSE_MATCHING, - { - "questions": DEFAULT_QUESTIONS, - "ground_truths": [ - "Consumerism is the most popular sport in the world", - "Python language was created by some dude.", - ], - "responses": DEFAULT_RESPONSES, - }, - {"method": "llm"}, - ), - ], -) -def test_integration_run(metric, inputs, metric_params): - init_params = { - "metric": metric, - "metric_params": metric_params, - "api": "openai", - } - eval = UpTrainEvaluator(**init_params) - output = eval.run(**inputs) - - assert type(output) == dict - assert len(output) == 1 - assert "results" in output - assert len(output["results"]) == len(next(iter(inputs.values()))) From 707cda8f27692a7313c304ae8288ff251c06a1d2 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 23:01:52 +0530 Subject: [PATCH 6/7] Run black --- integrations/uptrain/tests/test_evaluator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index a6aff793d..4e91c5e30 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -361,7 +361,12 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ], None, ), - (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("critique_tone", 0.4, "12")]], {"llm_persona": "idiot"}), + ( + UpTrainMetric.CRITIQUE_TONE, + {"responses": ["r9"]}, + [[("critique_tone", 0.4, "12")]], + {"llm_persona": "idiot"}, + ), 
( UpTrainMetric.GUIDELINE_ADHERENCE, {"questions": ["q10"], "responses": ["r10"]}, @@ -402,4 +407,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o} got = {(x["name"], x["score"], x["explanation"]) for x in r} assert got == expected - From f4113f038f58845647f361d009e0acfb60570792 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 23:11:39 +0530 Subject: [PATCH 7/7] Add api_key to init_params --- integrations/uptrain/tests/test_evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 4e91c5e30..30e804b2f 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -156,6 +156,7 @@ def test_integration_run(metric, inputs, metric_params): "metric": metric, "metric_params": metric_params, "api": "openai", + "api_key": Secret.from_env_var("OPENAI_API_KEY"), } eval = UpTrainEvaluator(**init_params) output = eval.run(**inputs)
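
Usage sketch: below is a minimal example of how the reworked RESPONSE_MATCHING metric might be driven once this series is applied, mirroring the integration test above. It is a sketch rather than part of the patches: the import paths and the OPENAI_API_KEY environment variable are assumptions taken from the rest of the integration, and a real key is needed because the check is LLM-backed.

    from haystack.utils import Secret
    from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

    # RESPONSE_MATCHING now expects questions alongside responses and ground_truths (PATCH 1/7),
    # and the API key is read from the environment instead of a raw token (PATCH 2/7).
    evaluator = UpTrainEvaluator(
        metric=UpTrainMetric.RESPONSE_MATCHING,
        metric_params={"method": "llm"},  # required init parameter for this metric
        api="openai",
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
    )

    output = evaluator.run(
        questions=["Which is the most popular global sport?"],
        responses=["Football is the most popular sport in the world."],
        ground_truths=["Football is the most popular sport."],
    )

    # One result list per input triple; each entry carries a name, a score and an
    # optional explanation, e.g. ("response_match_precision", 1.0, None).
    for result in output["results"]:
        for entry in result:
            print(entry["name"], entry["score"], entry["explanation"])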