From 84013849276f25e3397489915d11a5d8cceb3b09 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 4 Jul 2024 16:02:38 +0200 Subject: [PATCH] feat/refactor: Allow pipelines without generators to be used with the RAG eval harness (#31) --- .../evaluation/harness/rag/__init__.py | 3 +- .../evaluation/harness/rag/harness.py | 237 ++++++++++++------ .../evaluation/harness/rag/parameters.py | 10 +- test/evaluation/harness/rag/test_harness.py | 65 ++++- 4 files changed, 217 insertions(+), 98 deletions(-) diff --git a/haystack_experimental/evaluation/harness/rag/__init__.py b/haystack_experimental/evaluation/harness/rag/__init__.py index cc714697..fb7009f7 100644 --- a/haystack_experimental/evaluation/harness/rag/__init__.py +++ b/haystack_experimental/evaluation/harness/rag/__init__.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from .harness import RAGEvaluationHarness +from .harness import DefaultRAGArchitecture, RAGEvaluationHarness from .parameters import ( RAGEvaluationInput, RAGEvaluationMetric, @@ -13,6 +13,7 @@ ) _all_ = [ + "DefaultRAGArchitecture", "RAGEvaluationHarness", "RAGExpectedComponent", "RAGExpectedComponentMetadata", diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py index b66d9aa1..f76c48c3 100644 --- a/haystack_experimental/evaluation/harness/rag/harness.py +++ b/haystack_experimental/evaluation/harness/rag/harness.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from copy import deepcopy -from typing import Any, Dict, List, Optional, Set +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Union from haystack import Pipeline from haystack.evaluation.eval_run_result import EvaluationRunResult @@ -25,6 +26,83 @@ ) +class DefaultRAGArchitecture(Enum): + """ + Represents default RAG pipeline architectures that can be used with the evaluation harness. + """ + + #: A RAG pipeline with: + #: - A query embedder component named 'query_embedder' with a 'text' input. + #: - A document retriever component named 'retriever' with a 'documents' output. + EMBEDDING_RETRIEVAL = "embedding_retrieval" + + #: A RAG pipeline with: + #: - A document retriever component named 'retriever' with a 'query' input and a 'documents' output. + KEYWORD_RETRIEVAL = "keyword_retrieval" + + #: A RAG pipeline with: + #: - A query embedder component named 'query_embedder' with a 'text' input. + #: - A document retriever component named 'retriever' with a 'documents' output. + #: - A response generator component named 'generator' with a 'replies' output. + GENERATION_WITH_EMBEDDING_RETRIEVAL = "generation_with_embedding_retrieval" + + #: A RAG pipeline with: + #: - A document retriever component named 'retriever' with a 'query' input and a 'documents' output. + #: - A response generator component named 'generator' with a 'replies' output. + GENERATION_WITH_KEYWORD_RETRIEVAL = "generation_with_keyword_retrieval" + + @property + def expected_components( + self, + ) -> Dict[RAGExpectedComponent, RAGExpectedComponentMetadata]: + """ + Returns the expected components for the architecture. + + :returns: + The expected components. + """ + if self in ( + DefaultRAGArchitecture.EMBEDDING_RETRIEVAL, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, + ): + expected = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="query_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + } + elif self in ( + DefaultRAGArchitecture.KEYWORD_RETRIEVAL, + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, + ): + expected = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="retriever", input_mapping={"query": "query"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + } + else: + raise NotImplementedError(f"Unexpected default RAG architecture: {self}") + + if self in ( + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, + ): + expected[RAGExpectedComponent.RESPONSE_GENERATOR] = ( + RAGExpectedComponentMetadata( + name="generator", output_mapping={"replies": "replies"} + ) + ) + + return expected + + class RAGEvaluationHarness( EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput] ): @@ -35,7 +113,10 @@ class RAGEvaluationHarness( def __init__( self, rag_pipeline: Pipeline, - rag_components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + rag_components: Union[ + DefaultRAGArchitecture, + Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + ], metrics: Set[RAGEvaluationMetric], ): """ @@ -44,76 +125,23 @@ def __init__( :param rag_pipeline: The RAG pipeline to evaluate. :param rag_components: - A mapping of expected components to their metadata. + Either a default RAG architecture or a mapping + of expected components to their metadata. :param metrics: The metrics to use during evaluation. """ super().__init__() - self._validate_rag_components(rag_pipeline, rag_components) + if isinstance(rag_components, DefaultRAGArchitecture): + rag_components = rag_components.expected_components + + self._validate_rag_components(rag_pipeline, rag_components, metrics) self.rag_pipeline = rag_pipeline - self.rag_components = rag_components - self.metrics = metrics + self.rag_components = deepcopy(rag_components) + self.metrics = deepcopy(metrics) self.evaluation_pipeline = default_rag_evaluation_pipeline(metrics) - @classmethod - def default_with_embedding_retriever( - cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] - ) -> "RAGEvaluationHarness": - """ - Create a default evaluation harness for evaluating RAG pipelines with a query embedder. - - :param rag_pipeline: - The RAG pipeline to evaluate. The following assumptions are made: - - The query embedder component is named 'query_embedder' and has a 'text' input. - - The document retriever component is named 'retriever' and has a 'documents' output. - - The response generator component is named 'generator' and has a 'replies' output. - :param metrics: - The metrics to use during evaluation. - """ - rag_components = { - RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( - name="query_embedder", input_mapping={"query": "text"} - ), - RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( - name="retriever", output_mapping={"retrieved_documents": "documents"} - ), - RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( - name="generator", output_mapping={"replies": "replies"} - ), - } - - return cls(rag_pipeline, rag_components, deepcopy(metrics)) - - @classmethod - def default_with_keyword_retriever( - cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] - ) -> "RAGEvaluationHarness": - """ - Create a default evaluation harness for evaluating RAG pipelines with a keyword retriever. - - :param rag_pipeline: - The RAG pipeline to evaluate. The following assumptions are made: - - The document retriever component is named 'retriever' and has a 'query' input and a 'documents' output. - - The response generator component is named 'generator' and has a 'replies' output. - :param metrics: - The metrics to use during evaluation. - """ - rag_components = { - RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( - name="retriever", input_mapping={"query": "query"} - ), - RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( - name="retriever", output_mapping={"retrieved_documents": "documents"} - ), - RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( - name="generator", output_mapping={"replies": "replies"} - ), - } - - return cls(rag_pipeline, rag_components, deepcopy(metrics)) - def run( # noqa: D102 self, inputs: RAGEvaluationInput, @@ -141,10 +169,12 @@ def run( # noqa: D102 "retrieved_documents", ) ], - "responses": self._lookup_component_output( - RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies" - ), } + if RAGExpectedComponent.RESPONSE_GENERATOR in self.rag_components: + result_inputs["responses"] = self._lookup_component_output( + RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies" + ) + if inputs.ground_truth_answers is not None: result_inputs["ground_truth_answers"] = inputs.ground_truth_answers if inputs.ground_truth_documents is not None: @@ -199,6 +229,14 @@ def _generate_eval_run_pipelines( rag_pipeline = self._override_pipeline(self.rag_pipeline, rag_overrides) eval_pipeline = self._override_pipeline(self.evaluation_pipeline, eval_overrides) # type: ignore + included_first_outputs = { + self.rag_components[RAGExpectedComponent.DOCUMENT_RETRIEVER].name + } + if RAGExpectedComponent.RESPONSE_GENERATOR in self.rag_components: + included_first_outputs.add( + self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name + ) + return PipelinePair( first=rag_pipeline, second=eval_pipeline, @@ -206,10 +244,7 @@ def _generate_eval_run_pipelines( map_first_outputs=lambda x: self._aggregate_rag_outputs( # pylint: disable=unnecessary-lambda x ), - included_first_outputs={ - self.rag_components[RAGExpectedComponent.DOCUMENT_RETRIEVER].name, - self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name, - }, + included_first_outputs=included_first_outputs, ) def _aggregate_rag_outputs( @@ -217,16 +252,17 @@ def _aggregate_rag_outputs( ) -> Dict[str, Dict[str, Any]]: aggregate = aggregate_batched_pipeline_outputs(outputs) - # We only care about the first response from the generator. - generator_name = self.rag_components[ - RAGExpectedComponent.RESPONSE_GENERATOR - ].name - replies_output_name = self.rag_components[ - RAGExpectedComponent.RESPONSE_GENERATOR - ].output_mapping["replies"] - aggregate[generator_name][replies_output_name] = [ - r[0] for r in aggregate[generator_name][replies_output_name] - ] + if RAGExpectedComponent.RESPONSE_GENERATOR in self.rag_components: + # We only care about the first response from the generator. + generator_name = self.rag_components[ + RAGExpectedComponent.RESPONSE_GENERATOR + ].name + replies_output_name = self.rag_components[ + RAGExpectedComponent.RESPONSE_GENERATOR + ].output_mapping["replies"] + aggregate[generator_name][replies_output_name] = [ + r[0] for r in aggregate[generator_name][replies_output_name] + ] return aggregate @@ -383,11 +419,46 @@ def _prepare_eval_pipeline_additional_inputs( def _validate_rag_components( pipeline: Pipeline, components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + metrics: Set[RAGEvaluationMetric], ): - for e in RAGExpectedComponent: - if e not in components: + metric_specific_required_components = { + RAGEvaluationMetric.DOCUMENT_MAP: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + ], + RAGEvaluationMetric.DOCUMENT_MRR: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + ], + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + ], + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + ], + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.RESPONSE_GENERATOR, + ], + RAGEvaluationMetric.FAITHFULNESS: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + RAGExpectedComponent.RESPONSE_GENERATOR, + ], + RAGEvaluationMetric.CONTEXT_RELEVANCE: [ + RAGExpectedComponent.QUERY_PROCESSOR, + RAGExpectedComponent.DOCUMENT_RETRIEVER, + ], + } + + for m in metrics: + required_components = metric_specific_required_components[m] + if not all(c in components for c in required_components): raise ValueError( - f"RAG evaluation harness requires metadata for the '{e.value}' component." + f"In order to use the metric '{m}', the RAG evaluation harness requires metadata " + f"for the following components: {required_components}" ) pipeline_outputs = pipeline.outputs( diff --git a/haystack_experimental/evaluation/harness/rag/parameters.py b/haystack_experimental/evaluation/harness/rag/parameters.py index abae4251..638e4227 100644 --- a/haystack_experimental/evaluation/harness/rag/parameters.py +++ b/haystack_experimental/evaluation/harness/rag/parameters.py @@ -12,7 +12,7 @@ class RAGExpectedComponent(Enum): """ - Represents the basic components in a RAG pipeline that needs to be present for evaluation. + Represents the basic components in a RAG pipeline that are, by default, required to be present for evaluation. Each of these can be separate components in the pipeline or a single component that performs multiple tasks. @@ -27,6 +27,7 @@ class RAGExpectedComponent(Enum): DOCUMENT_RETRIEVER = "document_retriever" #: The component in a RAG pipeline that generates responses based on the query and the retrieved documents. + #: Can be optional if the harness is only evaluating retrieval. #: Expected outputs: `replies` - Name of out containing the LLM responses. Only the first response is used. RESPONSE_GENERATOR = "response_generator" @@ -57,24 +58,31 @@ class RAGEvaluationMetric(Enum): """ #: Document Mean Average Precision. + #: Required RAG components: Query Processor, Document Retriever. DOCUMENT_MAP = "metric_doc_map" #: Document Mean Reciprocal Rank. + #: Required RAG components: Query Processor, Document Retriever. DOCUMENT_MRR = "metric_doc_mrr" #: Document Recall with a single hit. + #: Required RAG components: Query Processor, Document Retriever. DOCUMENT_RECALL_SINGLE_HIT = "metric_doc_recall_single" #: Document Recall with multiple hits. + #: Required RAG components: Query Processor, Document Retriever. DOCUMENT_RECALL_MULTI_HIT = "metric_doc_recall_multi" #: Semantic Answer Similarity. + #: Required RAG components: Query Processor, Response Generator. SEMANTIC_ANSWER_SIMILARITY = "metric_sas" #: Faithfulness. + #: Required RAG components: Query Processor, Document Retriever, Response Generator. FAITHFULNESS = "metric_faithfulness" #: Context Relevance. + #: Required RAG components: Query Processor, Document Retriever. CONTEXT_RELEVANCE = "metric_context_relevance" diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py index 84a78741..65e528c2 100644 --- a/test/evaluation/harness/rag/test_harness.py +++ b/test/evaluation/harness/rag/test_harness.py @@ -5,6 +5,7 @@ import pytest from haystack_experimental.evaluation.harness.rag import ( + DefaultRAGArchitecture, RAGEvaluationHarness, RAGExpectedComponent, RAGExpectedComponentMetadata, @@ -377,12 +378,27 @@ def test_init_invalid_missing_outputs(self, rag_pipeline): def test_init_defaults( self, rag_pipeline_with_query_embedder, rag_pipeline_with_keyword_retriever ): - _ = RAGEvaluationHarness.default_with_embedding_retriever( - rag_pipeline_with_query_embedder, metrics={RAGEvaluationMetric.DOCUMENT_MAP} + _ = RAGEvaluationHarness( + rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) - _ = RAGEvaluationHarness.default_with_keyword_retriever( + _ = RAGEvaluationHarness( rag_pipeline_with_keyword_retriever, + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + _ = RAGEvaluationHarness( + rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.EMBEDDING_RETRIEVAL, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + _ = RAGEvaluationHarness( + rag_pipeline_with_keyword_retriever, + DefaultRAGArchitecture.KEYWORD_RETRIEVAL, metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) @@ -393,10 +409,11 @@ def test_init_defaults_invalid_missing_inputs( ValueError, match="Required input 'text' not found in 'query_processor' component named 'query_embedder'", ): - _ = RAGEvaluationHarness.default_with_embedding_retriever( + _ = RAGEvaluationHarness( build_rag_pipeline_with_query_embedder( embedder_name="llm", generator_name="query_embedder" ), + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) @@ -404,10 +421,11 @@ def test_init_defaults_invalid_missing_inputs( ValueError, match="Required input 'query' not found in 'query_processor' component named 'retriever'", ): - _ = RAGEvaluationHarness.default_with_keyword_retriever( + _ = RAGEvaluationHarness( build_rag_pipeline_with_keyword_retriever( retriever_name="llm", generator_name="retriever" ), + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) @@ -432,8 +450,9 @@ def test_init_defaults_invalid_missing_outputs(self): ValueError, match="Required output 'replies' not found in 'response_generator' component named 'generator'", ): - _ = RAGEvaluationHarness.default_with_embedding_retriever( + _ = RAGEvaluationHarness( non_conformant_query_embedder_pipeline, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) @@ -441,20 +460,36 @@ def test_init_defaults_invalid_missing_outputs(self): ValueError, match="Required output 'documents' not found in 'document_retriever' component named 'retriever'", ): - _ = RAGEvaluationHarness.default_with_keyword_retriever( + _ = RAGEvaluationHarness( non_conformant_keyword_retriever_pipeline, + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, metrics={RAGEvaluationMetric.DOCUMENT_MAP}, ) + def test_init_invalid_component_for_metric(self, rag_pipeline_with_query_embedder): + with pytest.raises( + ValueError, + match="In order to use the metric .* RAG evaluation harness requires metadata", + ): + _ = RAGEvaluationHarness( + rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.EMBEDDING_RETRIEVAL, + metrics={ + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY, + }, + ) + def test_run_invalid_ground_truths(self, rag_pipeline_with_query_embedder): - harness_map = RAGEvaluationHarness.default_with_embedding_retriever( + harness_map = RAGEvaluationHarness( rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={ RAGEvaluationMetric.DOCUMENT_MAP, }, ) - harness_sas = RAGEvaluationHarness.default_with_embedding_retriever( + harness_sas = RAGEvaluationHarness( rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={ RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY, }, @@ -502,8 +537,9 @@ def test_run_invalid_additional_input( self, rag_pipeline_with_query_embedder, ): - harness = RAGEvaluationHarness.default_with_embedding_retriever( + harness = RAGEvaluationHarness( rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={ RAGEvaluationMetric.DOCUMENT_MAP, }, @@ -527,8 +563,9 @@ def test_run_invalid_override( self, rag_pipeline_with_query_embedder, ): - harness = RAGEvaluationHarness.default_with_embedding_retriever( + harness = RAGEvaluationHarness( rag_pipeline_with_query_embedder, + DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL, metrics={ RAGEvaluationMetric.DOCUMENT_MAP, }, @@ -574,12 +611,13 @@ def test_run_statistical_metrics(self): RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT, RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, } - harness = RAGEvaluationHarness.default_with_keyword_retriever( + harness = RAGEvaluationHarness( build_rag_pipeline_with_keyword_retriever( retriever_component=MockKeywordRetriever(), generator_component=MockGenerator(arg=0), generator_name="generator", ), + DefaultRAGArchitecture.KEYWORD_RETRIEVAL, metrics=metrics, ) @@ -630,12 +668,13 @@ def test_run_model_based_metrics(self, monkeypatch): RAGEvaluationMetric.CONTEXT_RELEVANCE, RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY, } - harness = RAGEvaluationHarness.default_with_keyword_retriever( + harness = RAGEvaluationHarness( build_rag_pipeline_with_keyword_retriever( retriever_component=MockKeywordRetriever(), generator_component=MockGenerator(arg=0), generator_name="generator", ), + DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL, metrics=metrics, )