Merge branch 'main' into openapi

deepset-ai · Jun 24, 2024 · d714a6f
2 parents 7eb140f + f4c29d8
commit d714a6f
Show file tree

Hide file tree

Showing 6 changed files with 96 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -26,23 +26,25 @@ $ pip install -U haystack-experimental
 
 ## Experiments lifecycle
 
-Any experimental feature will be removed from `haystack-experimental` after a period of 3 months. After this time,
-the experiment will be either:
-- Merged into Haystack core and published in the next minor release
-- Released as a Core Integration,
+Each experimental feature has a default lifespan of 3 months starting from the date of the first non-pre-release build 
+that includes it. Once it reaches the end of its lifespan, the experiment will be either:
+- Merged into Haystack core and published in the next minor release, or
+- Released as a Core Integration, or
 - Dropped.
 
 ## Experiments catalog
 
 The latest version of the package contains the following experiments:
 
-| Name                     | Type                    | Experiment end date |
-|--------------------------|-------------------------| ------------------- |
-| [`EvaluationHarness`][1] | Evaluation orchestrator | August 2024         |
-| [`OpenAPITool`][2]       | OpenAPITool component   | August 2024         |
+| Name                        | Type                       | Expected experiment end date |
+| --------------------------- | -------------------------- | ---------------------------- |
+| [`EvaluationHarness`][1]    | Evaluation orchestrator    | September 2024               |
+| [`OpenAIFunctionCaller`][2] | Function Calling Component | September 2024               |
+| [`OpenAPITool`][3]          | OpenAPITool component      | September 2024               |
 
 [1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
-[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
+[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
+[3]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
 
 ## Usage
 
@@ -114,4 +116,4 @@ class Pipeline(HaystackPipeline):
 
 ## Contributing
 
-Direct contributions to `haystack-experimental` are not expected, but Haystack maintainers might ask contributors to move pull requests that target the [core repository](https://github.com/deepset-ai/haystack) to this repository.
+Direct contributions to `haystack-experimental` are not expected, but Haystack maintainers might ask contributors to move pull requests that target the [core repository](https://github.com/deepset-ai/haystack) to this repository.
diff --git a/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py
@@ -44,8 +44,8 @@ def default_rag_evaluation_pipeline(
         RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
             SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
         ),
-        RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
-        RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator,
+        RAGEvaluationMetric.ANSWER_FAITHFULNESS: partial(FaithfulnessEvaluator, raise_on_failure=False),
+        RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(ContextRelevanceEvaluator, raise_on_failure=False),
     }
 
     for metric in metrics:

diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )
 

diff --git a/haystack_experimental/version.py b/haystack_experimental/version.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["hatchling>=1.8.0"]
+requires = ["hatchling>=1.8.0", "hatch-vcs"]
 build-backend = "hatchling.build"
 
 [project]
@@ -74,7 +74,8 @@ sync = "./.github/utils/pydoc-markdown.sh"
 delete-outdated = "python ./.github/utils/delete_outdated_docs.py {args}"
 
 [tool.hatch.version]
-path = "haystack_experimental/version.py"
+source = "vcs"
+tag-pattern = 'v(?P<version>.*)'
 
 [tool.hatch.metadata]
 allow-direct-references = true
@@ -91,10 +92,7 @@ quiet-level = 3
 skip = "test/nodes/*,test/others/*,test/samples/*,e2e/*"
 
 [tool.pylint]
-ignore-paths = [
-  "haystack_experimental/__init__.py",
-  "haystack_experimental/version.py",
-]
+ignore-paths = ["haystack_experimental/__init__.py"]
 
 [tool.pylint.'MESSAGES CONTROL']
 max-line-length = 120

diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS