From e5d54b3120dc583f2d0ea1b0b45aa4aa2abf1f6f Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Sat, 16 Mar 2024 16:31:18 +0530 Subject: [PATCH 1/7] Fix issues in ResponseMatching operator --- .../components/evaluators/uptrain/metrics.py | 16 +++++++++------- integrations/uptrain/tests/test_evaluator.py | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py index a13843d4a..9ddb60709 100644 --- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py +++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py @@ -57,7 +57,7 @@ class UpTrainMetric(Enum): GUIDELINE_ADHERENCE = "guideline_adherence" #: Response matching.\ - #: Inputs - `responses: List[str], ground_truths: List[str]`\ + #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\ #: Parameters - `method: str` RESPONSE_MATCHING = "response_matching" @@ -235,13 +235,14 @@ def response( yield {"response": r} @staticmethod - def response_ground_truth( + def question_response_ground_truth( + questions: List[str], responses: List[str], ground_truths: List[str], ) -> Iterable[Dict[str, str]]: - InputConverters._validate_input_elements(ground_truths=ground_truths, responses=responses) - for r, gt in zip(responses, ground_truths): # type: ignore - yield {"response": r, "ground_truth": gt} + InputConverters._validate_input_elements(questions=questions, ground_truths=ground_truths, responses=responses) + for q, r, gt in zip(questions, responses, ground_truths): # type: ignore + yield {"question": q, "response": r, "ground_truth": gt} class OutputConverters: @@ -267,12 +268,13 @@ def validate_outputs(outputs: List[Dict[str, Any]]): ( float, str, + int, ), ) for x in outputs for y in x.values() ): - msg = "UpTrain evaluator expects values in the output dicts to be either `str` or `float`" + msg = "UpTrain evaluator expects values in the output dicts to be either `str`, `float` or `int`" if msg is not None: raise ValueError(msg) @@ -380,7 +382,7 @@ def response_matching( UpTrainMetric.RESPONSE_MATCHING: MetricDescriptor.new( UpTrainMetric.RESPONSE_MATCHING, ResponseMatching, - InputConverters.response_ground_truth, # type: ignore + InputConverters.question_response_ground_truth, # type: ignore OutputConverters.response_matching, init_parameters={"method": Optional[str]}, # type: ignore ), diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index d7566c795..418a006f0 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -211,7 +211,7 @@ def test_evaluator_serde(os_environ_get): {"questions": [], "responses": []}, {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, ), - (UpTrainMetric.RESPONSE_MATCHING, {"ground_truths": [], "responses": []}, {"method": "llm"}), + (UpTrainMetric.RESPONSE_MATCHING, {"questions": [], "ground_truths": [], "responses": []}, {"method": "llm"}), ], ) def test_evaluator_valid_inputs(metric, inputs, params): @@ -304,7 +304,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ), ( UpTrainMetric.RESPONSE_MATCHING, - {"ground_truths": ["g11"], "responses": ["r11"]}, + {"questions": ["q11"], "ground_truths": ["g11"], "responses": ["r11"]}, [ [ ("response_match_precision", 
1.0, None), @@ -375,6 +375,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): ( UpTrainMetric.RESPONSE_MATCHING, { + "questions": DEFAULT_QUESTIONS, "ground_truths": [ "Consumerism is the most popular sport in the world", "Python language was created by some dude.", From ca319e8651f9396e125e412edad06b1b381617bd Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Tue, 19 Mar 2024 12:27:53 +0530 Subject: [PATCH 2/7] Use .from_env_var instead of .from_token --- integrations/uptrain/tests/test_evaluator.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 418a006f0..81e7783aa 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -146,22 +146,22 @@ def test_evaluator_metric_init_params(): eval = UpTrainEvaluator( UpTrainMetric.CRITIQUE_TONE, metric_params={"llm_persona": "village idiot"}, - api_key=Secret.from_token("Aaa"), + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) assert eval._backend_metric.llm_persona == "village idiot" with pytest.raises(ValueError, match="Invalid init parameters"): UpTrainEvaluator( - UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_token("Aaa") + UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_env_var("OPENAI_API_KEY") ) with pytest.raises(ValueError, match="unexpected init parameters"): UpTrainEvaluator( - UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_token("Aaa") + UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_env_var("OPENAI_API_KEY") ) with pytest.raises(ValueError, match="expected init parameters"): - UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_token("Aaa")) + UpTrainEvaluator(UpTrainMetric.RESPONSE_MATCHING, api_key=Secret.from_env_var("OPENAI_API_KEY")) @patch("os.environ.get") @@ -218,7 +218,7 @@ def test_evaluator_valid_inputs(metric, inputs, params): init_params = { "metric": metric, "metric_params": params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) @@ -245,7 +245,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): init_params = { "metric": metric, "metric_params": params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) @@ -320,7 +320,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): init_params = { "metric": metric, "metric_params": metric_params, - "api_key": Secret.from_token("Aaa"), + "api_key": Secret.from_env_var("OPENAI_API_KEY"), "api_params": None, } eval = UpTrainEvaluator(**init_params) From 9b7d349fd8b2b542a2c99897df4bcc8438842874 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Tue, 19 Mar 2024 12:39:39 +0530 Subject: [PATCH 3/7] Run black --- integrations/uptrain/tests/test_evaluator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 81e7783aa..e51daf16f 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -152,12 +152,16 @@ def test_evaluator_metric_init_params(): with 
pytest.raises(ValueError, match="Invalid init parameters"): UpTrainEvaluator( - UpTrainMetric.CRITIQUE_TONE, metric_params={"role": "village idiot"}, api_key=Secret.from_env_var("OPENAI_API_KEY") + UpTrainMetric.CRITIQUE_TONE, + metric_params={"role": "village idiot"}, + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) with pytest.raises(ValueError, match="unexpected init parameters"): UpTrainEvaluator( - UpTrainMetric.FACTUAL_ACCURACY, metric_params={"check_numbers": True}, api_key=Secret.from_env_var("OPENAI_API_KEY") + UpTrainMetric.FACTUAL_ACCURACY, + metric_params={"check_numbers": True}, + api_key=Secret.from_env_var("OPENAI_API_KEY"), ) with pytest.raises(ValueError, match="expected init parameters"): From 22af43269d00f69dbb19ce636a9206b34f3b2aaa Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 22:34:59 +0530 Subject: [PATCH 4/7] Refactor --- .../components/evaluators/uptrain/metrics.py | 4 ++-- integrations/uptrain/tests/test_evaluator.py | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py index 9ddb60709..284f21682 100644 --- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py +++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py @@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union from uptrain import CritiqueTone, Evals, GuidelineAdherence, ResponseMatching # type: ignore -from uptrain.framework.evals import ParametricEval +from uptrain.framework.evalllm import ParametricEval class UpTrainMetric(Enum): @@ -313,7 +313,7 @@ def critique_language( def critique_tone( output: Dict[str, Any], metric_params: Optional[Dict[str, Any]] # noqa: ARG004 ) -> List[MetricResult]: - return [OutputConverters._extract_default_results(output, "tone")] + return [OutputConverters._extract_default_results(output, "critique_tone")] @staticmethod def guideline_adherence(output: Dict[str, Any], metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]: diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index e51daf16f..e935a69f9 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -82,8 +82,8 @@ def log_and_evaluate(self, data, checks, **kwargs): "explanation_politeness": "11", }, UpTrainMetric.CRITIQUE_TONE: { - "score_tone": 0.4, - "explanation_tone": "12", + "score_critique_tone": 0.4, + "explanation_critique_tone": "12", }, UpTrainMetric.GUIDELINE_ADHERENCE: { "score_guideline_adherence": 1.0, @@ -107,9 +107,8 @@ def test_evaluator_api(monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key") - eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS) - assert eval.api == "openai" - assert eval.api_key == Secret.from_env_var("OPENAI_API_KEY") + with pytest.raises(ValueError, match="OpenAI API Key is invalid"): + eval = UpTrainEvaluator(UpTrainMetric.RESPONSE_COMPLETENESS) eval = UpTrainEvaluator( UpTrainMetric.RESPONSE_COMPLETENESS, @@ -127,7 +126,7 @@ def test_evaluator_api(monkeypatch): with pytest.raises(ValueError, match="None of the following authentication environment variables are set"): UpTrainEvaluator(UpTrainMetric.CONTEXT_RELEVANCE, api="uptrain", 
api_key=Secret.from_env_var("asd39920qqq")) - with pytest.raises(ValueError, match="does not support additional parameters"): + with pytest.raises(ValueError, match="OpenAI API Key is invalid"): UpTrainEvaluator( UpTrainMetric.CONTEXT_RELEVANCE, api_params={"project_name": "test"}, @@ -299,7 +298,7 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ], None, ), - (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("tone", 0.4, "12")]], {"llm_persona": "idiot"}), + (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("critique_tone", 0.4, "12")]], {"llm_persona": "idiot"}), ( UpTrainMetric.GUIDELINE_ADHERENCE, {"questions": ["q10"], "responses": ["r10"]}, From 3e3fc46f27acfdbdf0ba4dcb9282b53dc678e364 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 22:54:12 +0530 Subject: [PATCH 5/7] Reorder tests --- integrations/uptrain/tests/test_evaluator.py | 125 ++++++++++--------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index e935a69f9..a6aff793d 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -103,6 +103,69 @@ def log_and_evaluate(self, data, checks, **kwargs): return data +# This integration test validates the evaluator by running it against the +# OpenAI API. It is parameterized by the metric, the inputs to the evalutor +# and the metric parameters. +@pytest.mark.integration +@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set") +@pytest.mark.parametrize( + "metric, inputs, metric_params", + [ + (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None), + ( + UpTrainMetric.FACTUAL_ACCURACY, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + ( + UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + ( + UpTrainMetric.RESPONSE_CONSISTENCY, + {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, + None, + ), + (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None), + (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}), + ( + UpTrainMetric.GUIDELINE_ADHERENCE, + {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, + {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, + ), + ( + UpTrainMetric.RESPONSE_MATCHING, + { + "questions": DEFAULT_QUESTIONS, + "ground_truths": [ + "Consumerism is the most popular sport in the world", + "Python language was created by some dude.", + ], + "responses": DEFAULT_RESPONSES, + }, + {"method": "llm"}, + ), + ], +) +def test_integration_run(metric, inputs, metric_params): + init_params = { + "metric": metric, + "metric_params": metric_params, + "api": "openai", + } + eval = UpTrainEvaluator(**init_params) + output = eval.run(**inputs) + + assert type(output) == dict + assert len(output) == 1 + assert "results" in output + assert 
len(output["results"]) == len(next(iter(inputs.values()))) + + def test_evaluator_api(monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") monkeypatch.setenv("UPTRAIN_API_KEY", "test-api-key") @@ -340,65 +403,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): got = {(x["name"], x["score"], x["explanation"]) for x in r} assert got == expected - -# This integration test validates the evaluator by running it against the -# OpenAI API. It is parameterized by the metric, the inputs to the evalutor -# and the metric parameters. -@pytest.mark.integration -@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set") -@pytest.mark.parametrize( - "metric, inputs, metric_params", - [ - (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None), - ( - UpTrainMetric.FACTUAL_ACCURACY, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - ( - UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - ( - UpTrainMetric.RESPONSE_CONSISTENCY, - {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES}, - None, - ), - (UpTrainMetric.RESPONSE_CONCISENESS, {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.CRITIQUE_LANGUAGE, {"responses": DEFAULT_RESPONSES}, None), - (UpTrainMetric.CRITIQUE_TONE, {"responses": DEFAULT_RESPONSES}, {"llm_persona": "idiot"}), - ( - UpTrainMetric.GUIDELINE_ADHERENCE, - {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES}, - {"guideline": "Do nothing", "guideline_name": "somename", "response_schema": None}, - ), - ( - UpTrainMetric.RESPONSE_MATCHING, - { - "questions": DEFAULT_QUESTIONS, - "ground_truths": [ - "Consumerism is the most popular sport in the world", - "Python language was created by some dude.", - ], - "responses": DEFAULT_RESPONSES, - }, - {"method": "llm"}, - ), - ], -) -def test_integration_run(metric, inputs, metric_params): - init_params = { - "metric": metric, - "metric_params": metric_params, - "api": "openai", - } - eval = UpTrainEvaluator(**init_params) - output = eval.run(**inputs) - - assert type(output) == dict - assert len(output) == 1 - assert "results" in output - assert len(output["results"]) == len(next(iter(inputs.values()))) From 707cda8f27692a7313c304ae8288ff251c06a1d2 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 23:01:52 +0530 Subject: [PATCH 6/7] Run black --- integrations/uptrain/tests/test_evaluator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index a6aff793d..4e91c5e30 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -361,7 +361,12 @@ def test_evaluator_invalid_inputs(metric, inputs, error_string, params): ], None, ), - (UpTrainMetric.CRITIQUE_TONE, {"responses": ["r9"]}, [[("critique_tone", 0.4, "12")]], {"llm_persona": "idiot"}), + ( + UpTrainMetric.CRITIQUE_TONE, + {"responses": ["r9"]}, + [[("critique_tone", 0.4, "12")]], + {"llm_persona": "idiot"}, + ), 
( UpTrainMetric.GUIDELINE_ADHERENCE, {"questions": ["q10"], "responses": ["r10"]}, @@ -402,4 +407,3 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params): expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o} got = {(x["name"], x["score"], x["explanation"]) for x in r} assert got == expected - From f4113f038f58845647f361d009e0acfb60570792 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Fri, 12 Apr 2024 23:11:39 +0530 Subject: [PATCH 7/7] Add api_key to init_params --- integrations/uptrain/tests/test_evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/uptrain/tests/test_evaluator.py b/integrations/uptrain/tests/test_evaluator.py index 4e91c5e30..30e804b2f 100644 --- a/integrations/uptrain/tests/test_evaluator.py +++ b/integrations/uptrain/tests/test_evaluator.py @@ -156,6 +156,7 @@ def test_integration_run(metric, inputs, metric_params): "metric": metric, "metric_params": metric_params, "api": "openai", + "api_key": Secret.from_env_var("OPENAI_API_KEY"), } eval = UpTrainEvaluator(**init_params) output = eval.run(**inputs)
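
Usage sketch: below is a minimal example of how the reworked RESPONSE_MATCHING metric might be driven once this series is applied, mirroring the integration test above. It is a sketch rather than part of the patches: the import paths and the OPENAI_API_KEY environment variable are assumptions taken from the rest of the integration, and a real key is needed because the check is LLM-backed.

    from haystack.utils import Secret
    from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

    # RESPONSE_MATCHING now expects questions alongside responses and ground_truths (PATCH 1/7),
    # and the API key is read from the environment instead of a raw token (PATCH 2/7).
    evaluator = UpTrainEvaluator(
        metric=UpTrainMetric.RESPONSE_MATCHING,
        metric_params={"method": "llm"},  # required init parameter for this metric
        api="openai",
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
    )

    output = evaluator.run(
        questions=["Which is the most popular global sport?"],
        responses=["Football is the most popular sport in the world."],
        ground_truths=["Football is the most popular sport."],
    )

    # One result list per input triple; each entry carries a name, a score and an
    # optional explanation, e.g. ("response_match_precision", 1.0, None).
    for result in output["results"]:
        for entry in result:
            print(entry["name"], entry["score"], entry["explanation"])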