diff --git a/README.md b/README.md
index a3ef99b2..ab7c7617 100644
--- a/README.md
+++ b/README.md
@@ -26,23 +26,25 @@ $ pip install -U haystack-experimental
 
 ## Experiments lifecycle
 
-Any experimental feature will be removed from `haystack-experimental` after a period of 3 months. After this time,
-the experiment will be either:
-- Merged into Haystack core and published in the next minor release
-- Released as a Core Integration,
+Each experimental feature has a default lifespan of 3 months starting from the date of the first non-pre-release build
+that includes it. Once it reaches the end of its lifespan, the experiment will be either:
+- Merged into Haystack core and published in the next minor release, or
+- Released as a Core Integration, or
 - Dropped.
 
 ## Experiments catalog
 
 The latest version of the package contains the following experiments:
 
-| Name                     | Type                    | Experiment end date |
-|--------------------------|-------------------------| ------------------- |
-| [`EvaluationHarness`][1] | Evaluation orchestrator | August 2024         |
-| [`OpenAPITool`][2]       | OpenAPITool component   | August 2024         |
+| Name                        | Type                       | Expected experiment end date |
+| --------------------------- | -------------------------- | ---------------------------- |
+| [`EvaluationHarness`][1]    | Evaluation orchestrator    | September 2024               |
+| [`OpenAIFunctionCaller`][2] | Function Calling Component | September 2024               |
+| [`OpenAPITool`][3]          | OpenAPITool component      | September 2024               |
 
 [1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
-[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
+[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
+[3]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
 
 ## Usage
 
@@ -114,4 +116,4 @@ class Pipeline(HaystackPipeline):
 
 ## Contributing
 
-Direct contributions to `haystack-experimental` are not expected, but Haystack maintainers might ask contributors to move pull requests that target the [core repository](https://github.com/deepset-ai/haystack) to this repository.
\ No newline at end of file
+Direct contributions to `haystack-experimental` are not expected, but Haystack maintainers might ask contributors to move pull requests that target the [core repository](https://github.com/deepset-ai/haystack) to this repository.
diff --git a/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py
index 08c295c8..581a3150 100644
--- a/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py
+++ b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py
@@ -44,8 +44,8 @@ def default_rag_evaluation_pipeline(
         RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
             SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
         ),
-        RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
-        RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator,
+        RAGEvaluationMetric.ANSWER_FAITHFULNESS: partial(FaithfulnessEvaluator, raise_on_failure=False),
+        RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(ContextRelevanceEvaluator, raise_on_failure=False),
     }
 
     for metric in metrics:
diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py
index 50dd446f..9ea3cfae 100644
--- a/haystack_experimental/evaluation/harness/rag/harness.py
+++ b/haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )
 
diff --git a/haystack_experimental/version.py b/haystack_experimental/version.py
deleted file mode 100644
index 9079a6e4..00000000
--- a/haystack_experimental/version.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH
-#
-# SPDX-License-Identifier: Apache-2.0
-
-__version__ = "0.0.1"
diff --git a/pyproject.toml b/pyproject.toml
index 94e4f9b1..19d086ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["hatchling>=1.8.0"]
+requires = ["hatchling>=1.8.0", "hatch-vcs"]
 build-backend = "hatchling.build"
 
 [project]
@@ -74,7 +74,8 @@ sync = "./.github/utils/pydoc-markdown.sh"
 delete-outdated = "python ./.github/utils/delete_outdated_docs.py {args}"
 
 [tool.hatch.version]
-path = "haystack_experimental/version.py"
+source = "vcs"
+tag-pattern = 'v(?P<version>.*)'
 
 [tool.hatch.metadata]
 allow-direct-references = true
@@ -91,10 +92,7 @@ quiet-level = 3
 skip = "test/nodes/*,test/others/*,test/samples/*,e2e/*"
 
 [tool.pylint]
-ignore-paths = [
-    "haystack_experimental/__init__.py",
-    "haystack_experimental/version.py",
-]
+ignore-paths = ["haystack_experimental/__init__.py"]
 
 [tool.pylint.'MESSAGES CONTROL']
 max-line-length = 120
diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py
index 1ec9763f..b36e51d9 100644
--- a/test/evaluation/harness/rag/test_harness.py
+++ b/test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS