Fix issues in ResponseMatching operator #589

Closed
@@ -6,7 +6,7 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
 
 from uptrain import CritiqueTone, Evals, GuidelineAdherence, ResponseMatching  # type: ignore
-from uptrain.framework.evals import ParametricEval
+from uptrain.framework.evalllm import ParametricEval
 
 
 class UpTrainMetric(Enum):
@@ -57,7 +57,7 @@ class UpTrainMetric(Enum):
     GUIDELINE_ADHERENCE = "guideline_adherence"
 
     #: Response matching.\
-    #: Inputs - `responses: List[str], ground_truths: List[str]`\
+    #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\
     #: Parameters - `method: str`
     RESPONSE_MATCHING = "response_matching"
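For reference, a minimal usage sketch of the updated metric, assuming the integration's public API as exercised by the tests below (`UpTrainEvaluator`, `UpTrainMetric`, Haystack's `Secret`; the import paths are assumed and the sample strings are illustrative):

from haystack.utils import Secret
from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

# Response matching now takes the questions alongside the responses and ground truths.
evaluator = UpTrainEvaluator(
    metric=UpTrainMetric.RESPONSE_MATCHING,
    metric_params={"method": "llm"},
    api="openai",
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
)
output = evaluator.run(
    questions=["Which is the most popular global sport?"],
    responses=["Football is the most popular sport."],
    ground_truths=["Football is the most popular sport in the world."],
)
# output["results"] holds one list of results per input triple, with entries such as
# "response_match_precision", "response_match_recall" and "response_match".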

@@ -235,13 +235,14 @@ def response(
             yield {"response": r}
 
     @staticmethod
-    def response_ground_truth(
+    def question_response_ground_truth(
+        questions: List[str],
         responses: List[str],
         ground_truths: List[str],
     ) -> Iterable[Dict[str, str]]:
-        InputConverters._validate_input_elements(ground_truths=ground_truths, responses=responses)
-        for r, gt in zip(responses, ground_truths):  # type: ignore
-            yield {"response": r, "ground_truth": gt}
+        InputConverters._validate_input_elements(questions=questions, ground_truths=ground_truths, responses=responses)
+        for q, r, gt in zip(questions, responses, ground_truths):  # type: ignore
+            yield {"question": q, "response": r, "ground_truth": gt}
 
 
 class OutputConverters:
@@ -267,12 +268,13 @@ def validate_outputs(outputs: List[Dict[str, Any]]):
                 (
                     float,
                     str,
+                    int,
                 ),
             )
             for x in outputs
             for y in x.values()
         ):
-            msg = "UpTrain evaluator expects values in the output dicts to be either `str` or `float`"
+            msg = "UpTrain evaluator expects values in the output dicts to be either `str`, `float` or `int`"
 
         if msg is not None:
             raise ValueError(msg)
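A quick, self-contained sketch of what the relaxed check now permits (the output row below is hypothetical; integer-valued scores are presumably what motivated this change, and the real validation lives in `OutputConverters.validate_outputs`):

# An output dict containing an int score now passes the type check.
outputs = [{"response_match_recall": 1, "explanation_response_match": "exact match"}]
assert all(isinstance(y, (float, str, int)) for x in outputs for y in x.values())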
@@ -311,7 +313,7 @@ def critique_language(
     def critique_tone(
         output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]  # noqa: ARG004
     ) -> List[MetricResult]:
-        return [OutputConverters._extract_default_results(output, "tone")]
+        return [OutputConverters._extract_default_results(output, "critique_tone")]
 
     @staticmethod
     def guideline_adherence(output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]:
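To make the renaming concrete: the mocked test data below suggests the converter now picks up the `score_critique_tone` / `explanation_critique_tone` keys from UpTrain's output and emits a result named `critique_tone`. A hypothetical shape, with illustrative values:

# Raw UpTrain output row for CRITIQUE_TONE (illustrative values only).
uptrain_row = {"score_critique_tone": 0.4, "explanation_critique_tone": "The tone is overly formal."}

# Corresponding result emitted by the evaluator (name/score/explanation keys, as asserted in the tests).
metric_result = {"name": "critique_tone", "score": 0.4, "explanation": "The tone is overly formal."}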
@@ -380,7 +382,7 @@ def response_matching(
     UpTrainMetric.RESPONSE_MATCHING: MetricDescriptor.new(
         UpTrainMetric.RESPONSE_MATCHING,
         ResponseMatching,
-        InputConverters.response_ground_truth,  # type: ignore
+        InputConverters.question_response_ground_truth,  # type: ignore
         OutputConverters.response_matching,
         init_parameters={"method": Optional[str]},  # type: ignore
     ),
integrations/uptrain/tests/test_evaluator.py (166 changes: 88 additions & 78 deletions)
@@ -82,8 +82,8 @@ def log_and_evaluate(self, data, checks, **kwargs):
                 "explanation_politeness": "11",
             },
             UpTrainMetric.CRITIQUE_TONE: {
-                "score_tone": 0.4,
-                "explanation_tone": "12",
+                "score_critique_tone": 0.4,
+                "explanation_critique_tone": "12",
             },
             UpTrainMetric.GUIDELINE_ADHERENCE: {
                 "score_guideline_adherence": 1.0,
@@ -103,13 +103,76 @@ def log_and_evaluate(self, data, checks, **kwargs):
         return data
 
 
+# This integration test validates the evaluator by running it against the
+# OpenAI API. It is parameterized by the metric, the inputs to the evaluator
+# and the metric parameters.
+@pytest.mark.integration
+@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
+@pytest.mark.parametrize(
+    "metric, inputs, metric_params",
+    [
+        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
+        (
+            UpTrainMetric.FACTUAL_ACCURACY,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (
+            UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (
+            UpTrainMetric.RESPONSE_CONSISTENCY,
+            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
+            None,
+        ),
+        (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None),
+        (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}),
+        (
+            UpTrainMetric.GUIDELINE_ADHERENCE,
+            {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES},
+            {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
+        ),
+        (
+            UpTrainMetric.RESPONSE_MATCHING,
+            {
+                "questions": DEFAULT_QUESTIONS,
+                "ground_truths": [
+                    "Consumerism is the most popular sport in the world",
+                    "Python language was created by some dude.",
+                ],
+                "responses": DEFAULT_RESPONSES,
+            },
+            {"method": "llm"},
+        ),
+    ],
+)
+def test_integration_run(metric, inputs, metric_params):
+    init_params = {
+        "metric": metric,
+        "metric_params": metric_params,
+        "api": "openai",
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
+    }
+    eval = UpTrainEvaluator(**init_params)
+    output = eval.run(**inputs)
+
+    assert type(output) == dict
+    assert len(output) == 1
+    assert "results" in output
+    assert len(output["results"]) == len(next(iter(inputs.values())))
+
+
 def test_evaluator_api(monkeypatch):
     monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
     monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key")
 
-    eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS)
-    assert eval.api == "openai"
-    assert eval.api_key == Secret.from_env_var("OPENAI_API_KEY")
+    with pytest.raises(ValueError, match="OpenAI API Key is invalid"):
+        eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS)
 
     eval = UpTrainEvaluator(
         UpTrainMetric.RESPONSE_COMPLETENESS,
@@ -127,7 +190,7 @@ def test_evaluator_api(monkeypatch):
     with pytest.raises(ValueError, match="None of the following authentication environment variables are set"):
         UpTrainEvaluator(UpTrainMetric.CONTEXT_RELEVANCE, api="uptrain", api_key=Secret.from_env_var("asd39920qqq"))
 
-    with pytest.raises(ValueError, match="does not support additional parameters"):
+    with pytest.raises(ValueError, match="OpenAI API Key is invalid"):
         UpTrainEvaluator(
             UpTrainMetric.CONTEXT_RELEVANCE,
             api_params={"project_name": "test"},
@@ -146,22 +209,26 @@ def test_evaluator_metric_init_params():
     eval = UpTrainEvaluator(
         UpTrainMetric.CRITIQUE_TONE,
         metric_params={"llm_persona": "village idiot"},
-        api_key=Secret.from_token("Aaa"),
+        api_key=Secret.from_env_var("OPENAI_API_KEY"),
     )
     assert eval._backend_metric.llm_persona == "village idiot"
 
     with pytest.raises(ValueError, match="Invalid init parameters"):
         UpTrainEvaluator(
-            UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_token("Aaa")
+            UpTrainMetric.CRITIQUE_TONE,
+            metric_params={"role": "village idiot"},
+            api_key=Secret.from_env_var("OPENAI_API_KEY"),
         )
 
     with pytest.raises(ValueError, match="unexpected init parameters"):
         UpTrainEvaluator(
-            UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_token("Aaa")
+            UpTrainMetric.FACTUAL_ACCURACY,
+            metric_params={"check_numbers": True},
+            api_key=Secret.from_env_var("OPENAI_API_KEY"),
         )
 
     with pytest.raises(ValueError, match="expected init parameters"):
-        UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_token("Aaa"))
+        UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_env_var("OPENAI_API_KEY"))
 
 
 @patch("os.environ.get")
@@ -211,14 +278,14 @@ def test_evaluator_serde(os_environ_get):
             {"questions": [], "responses": []},
             {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
         ),
-        (UpTrainMetric.RESPONSE_MATCHING, {"ground_truths": [], "responses": []}, {"method": "llm"}),
+        (UpTrainMetric.RESPONSE_MATCHING, {"questions": [], "ground_truths": [], "responses": []}, {"method": "llm"}),
     ],
 )
 def test_evaluator_valid_inputs(metric, inputs, params):
     init_params = {
         "metric": metric,
         "metric_params": params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
         "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -245,7 +312,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
     init_params = {
         "metric": metric,
         "metric_params": params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
         "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -295,7 +362,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
             ],
             None,
         ),
-        (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("tone", 0.4, "12")]], {"llm_persona": "idiot"}),
+        (
+            UpTrainMetric.CRITIQUE_TONE,
+            {"responses": ["r9"]},
+            [[("critique_tone", 0.4, "12")]],
+            {"llm_persona": "idiot"},
+        ),
         (
             UpTrainMetric.GUIDELINE_ADHERENCE,
             {"questions": ["q10"], "responses": ["r10"]},
@@ -304,7 +376,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params):
         ),
         (
             UpTrainMetric.RESPONSE_MATCHING,
-            {"ground_truths": ["g11"], "responses": ["r11"]},
+            {"questions": ["q11"], "ground_truths": ["g11"], "responses": ["r11"]},
             [
                 [
                     ("response_match_precision", 1.0, None),
@@ -320,7 +392,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params):
     init_params = {
         "metric": metric,
         "metric_params": metric_params,
-        "api_key": Secret.from_token("Aaa"),
+        "api_key": Secret.from_env_var("OPENAI_API_KEY"),
        "api_params": None,
     }
     eval = UpTrainEvaluator(**init_params)
@@ -336,65 +408,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params):
         expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o}
         got = {(x["name"], x["score"], x["explanation"]) for x in r}
         assert got == expected
-
-
-# This integration test validates the evaluator by running it against the
-# OpenAI API. It is parameterized by the metric, the inputs to the evalutor
-# and the metric parameters.
-@pytest.mark.integration
-@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set")
-@pytest.mark.parametrize(
-    "metric, inputs, metric_params",
-    [
-        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
-        (
-            UpTrainMetric.FACTUAL_ACCURACY,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (
-            UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (
-            UpTrainMetric.RESPONSE_CONSISTENCY,
-            {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
-        ),
-        (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None),
-        (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}),
-        (
-            UpTrainMetric.GUIDELINE_ADHERENCE,
-            {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES},
-            {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None},
-        ),
-        (
-            UpTrainMetric.RESPONSE_MATCHING,
-            {
-                "ground_truths": [
-                    "Consumerism is the most popular sport in the world",
-                    "Python language was created by some dude.",
-                ],
-                "responses": DEFAULT_RESPONSES,
-            },
-            {"method": "llm"},
-        ),
-    ],
-)
-def test_integration_run(metric, inputs, metric_params):
-    init_params = {
-        "metric": metric,
-        "metric_params": metric_params,
-        "api": "openai",
-    }
-    eval = UpTrainEvaluator(**init_params)
-    output = eval.run(**inputs)
-
-    assert type(output) == dict
-    assert len(output) == 1
-    assert "results" in output
-    assert len(output["results"]) == len(next(iter(inputs.values())))