From 4eebc290a0eb54c66f1b83289556b29ea59bb7d2 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 2 Apr 2024 22:25:24 +0200 Subject: [PATCH 01/15] adding files --- examples/rag_eval_squad.py | 47 ++++++++ examples/run_evaluation.py | 219 +++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 examples/rag_eval_squad.py create mode 100644 examples/run_evaluation.py diff --git a/examples/rag_eval_squad.py b/examples/rag_eval_squad.py new file mode 100644 index 0000000000..3483eabe51 --- /dev/null +++ b/examples/rag_eval_squad.py @@ -0,0 +1,47 @@ +import json +from collections import defaultdict +from pathlib import Path + +from datasets import load_dataset +from tqdm import tqdm + + +def aggregate_wiki_title(data, agg_wiki_title): + for idx, x in enumerate(data.iter(batch_size=1)): + if x["context"] not in agg_wiki_title[x["title"][0]]["context"]: + agg_wiki_title[x["title"][0]]["context"].append(x["context"]) + agg_wiki_title[x["title"][0]]["question_answers"].append({"question": x["question"], "answers": x["answers"]}) + + +def main(): + data_train = load_dataset("squad", split="train") + data_validation = load_dataset("squad", split="validation") + agg_wiki_title = defaultdict(lambda: {"context": [], "question_answers": [], "text": ""}) + aggregate_wiki_title(data_train, agg_wiki_title) + aggregate_wiki_title(data_validation, agg_wiki_title) + + # merge the context into a single document + for article in tqdm(agg_wiki_title.keys()): + agg_wiki_title[article]["text"] = "\n".join([x[0] for x in agg_wiki_title[article]["context"]]) + + # create documents + for article in agg_wiki_title.keys(): + out_path = Path("transformed_squad/articles/") + out_path.mkdir(parents=True, exist_ok=True) + with open(f"{str(out_path)}/{article}.txt", "w") as f: + f.write(agg_wiki_title[article]["text"]) + + # create question/answers + questions = Path("transformed_squad/") + questions.mkdir(parents=True, exist_ok=True) + with open(f"{str(questions)}/questions.jsonl", "w") as f: + for article in agg_wiki_title.keys(): + for entry in agg_wiki_title[article]["question_answers"]: + f.write( + json.dumps({"question": entry["question"][0], "document": article, "answers": entry["answers"][0]}) + + "\n" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/run_evaluation.py b/examples/run_evaluation.py new file mode 100644 index 0000000000..dd9136419b --- /dev/null +++ b/examples/run_evaluation.py @@ -0,0 +1,219 @@ +import json +import os +import random +from typing import List + +from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric + +from haystack import Document, Pipeline +from haystack.components.builders import AnswerBuilder, PromptBuilder +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.evaluators import AnswerExactMatchEvaluator +from haystack.components.generators import OpenAIGenerator +from haystack.components.retrievers import InMemoryEmbeddingRetriever +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.components.writers import DocumentWriter +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.document_stores.types import DuplicatePolicy + + +def load_transformed_squad(): + with open("transformed_squad/questions.jsonl", "r") as f: + questions = [json.loads(x) for x in f.readlines()] + + def create_document(text: str, name: str): + return 
Document(content=text, meta={"name": name}) + + # walk through the files in the directory and transform each text file into a Document + documents = [] + for root, dirs, files in os.walk("transformed_squad/articles/"): + for article in files: + with open(f"{root}/{article}", "r") as f: + raw_texts = f.read().split("\n") + for text in raw_texts: + documents.append(create_document(text, article.replace(".txt", ""))) + + return questions, documents + + +def indexing(documents: List[Document]): + document_store = InMemoryDocumentStore() + + doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) + doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") + + ingestion_pipe = Pipeline() + ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") + ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") + + ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents") + ingestion_pipe.run({"doc_embedder": {"documents": documents}}) + + return document_store + + +def run_evaluation(): + template = """ + Given the following information, answer the question. + + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + questions, documents = load_transformed_squad() + document_store = indexing(documents) + + rag_pipeline_1 = Pipeline() + rag_pipeline_1.add_component( + "query_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") + ) + rag_pipeline_1.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=2)) + rag_pipeline_1.add_component("prompt_builder", PromptBuilder(template=template)) + rag_pipeline_1.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) + rag_pipeline_1.add_component("answer_builder", AnswerBuilder()) + + rag_pipeline_1.connect("query_embedder", "retriever.query_embedding") + rag_pipeline_1.connect("retriever", "prompt_builder.documents") + rag_pipeline_1.connect("prompt_builder", "llm") + rag_pipeline_1.connect("llm.replies", "answer_builder.replies") + rag_pipeline_1.connect("llm.meta", "answer_builder.meta") + rag_pipeline_1.connect("retriever", "answer_builder.documents") + + # select 5 random questions from the list of questions + for random_questions in random.sample(questions, 5): + question = random_questions["question"] + answer = random_questions["answers"]["text"] + print(f"Question: {question}") + print(f"Answer: {answer}") + print() + response = rag_pipeline_1.run( + { + "query_embedder": {"text": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + } + ) + print("Answer from pipeline:") + print(response["answer_builder"]["answers"][0].data) + print("\n") + + +def seven_wonders(): + template = """ + Given the following information, answer the question. 
+ + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + questions, documents = load_transformed_squad() + document_store = InMemoryDocumentStore() + document_store.write_documents(documents) + + rag_pipeline_1 = Pipeline() + rag_pipeline_1.add_component("retriever", InMemoryBM25Retriever(document_store, top_k=10)) + rag_pipeline_1.add_component("prompt_builder", PromptBuilder(template=template)) + rag_pipeline_1.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) + rag_pipeline_1.add_component(instance=AnswerBuilder(), name="answer_builder") + rag_pipeline_1.connect("retriever", "prompt_builder.documents") + rag_pipeline_1.connect("prompt_builder", "llm") + rag_pipeline_1.connect("llm.replies", "answer_builder.replies") + rag_pipeline_1.connect("llm.meta", "answer_builder.meta") + rag_pipeline_1.connect("retriever", "answer_builder.documents") + + document_store = InMemoryDocumentStore() + document_store.write_documents(documents) + rag_pipeline_2 = Pipeline() + rag_pipeline_2.add_component("retriever", InMemoryBM25Retriever(document_store, top_k=1)) + rag_pipeline_2.add_component("prompt_builder", PromptBuilder(template=template)) + rag_pipeline_2.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) + rag_pipeline_2.add_component(instance=AnswerBuilder(), name="answer_builder") + rag_pipeline_2.connect("retriever", "prompt_builder.documents") + rag_pipeline_2.connect("prompt_builder", "llm") + rag_pipeline_2.connect("llm.replies", "answer_builder.replies") + rag_pipeline_2.connect("llm.meta", "answer_builder.meta") + rag_pipeline_2.connect("retriever", "answer_builder.documents") + + question = "What does Rhodes Statue look like?" + for pipeline in [rag_pipeline_1, rag_pipeline_2]: + response = pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + } + ) + print(response["answer_builder"]["answers"][0]) + + evaluator_exact_match = AnswerExactMatchEvaluator() + + eval_pipeline = Pipeline() + evaluator_context = RagasEvaluator(metric=RagasMetric.CONTEXT_PRECISION) + evaluator_aspect = RagasEvaluator( + metric=RagasMetric.ASPECT_CRITIQUE, + metric_params={"name": "custom", "definition": "Is this answer problematic for children?", "strictness": 3}, + ) + eval_pipeline.add_component("evaluator_context", evaluator_context) + eval_pipeline.add_component("evaluator_aspect", evaluator_aspect) + eval_pipeline.add_component("evaluator_exact_match", evaluator_exact_match) + + questions = [ + "Where is Gardens of Babylon?", + "Why did people build Great Pyramid of Giza?", + # "What does Rhodes Statue look like?", + # "Why did people visit the Temple of Artemis?", + # "What is the importance of Colossus of Rhodes?", + # "What happened to the Tomb of Mausolus?", + # "How did Colossus of Rhodes collapse?", + ] + ground_truths = [ + "The gardens of Babylon were in the ancient city of Babylon, near present-day Hillah, Babil province, in Iraq.", + "The pyramids of Giza were constructed to honor the pharaoh and to serve as his tomb after death.", + ] + + for pipeline in [rag_pipeline_1, rag_pipeline_2]: + contexts = [] + responses = [] + for question in questions: + results = pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + } + ) + + context = [doc.content for doc in results["answer_builder"]["answers"][0].documents] + response = 
results["answer_builder"]["answers"][0].data + contexts.append(context) + responses.append(response) + + results = eval_pipeline.run( + { + "evaluator_context": {"questions": questions, "contexts": contexts, "ground_truths": ground_truths}, + "evaluator_aspect": {"questions": questions, "contexts": contexts, "responses": responses}, + "evaluator_exact_match": { + "questions": questions, + "ground_truth_answers": ground_truths, + "predicted_answers": responses, + }, + } + ) + print(results) + + # Users can also run evaluator components individually outside of a pipeline + evaluator = AnswerExactMatchEvaluator() + exact_match_result = evaluator.run( + questions=questions, ground_truth_answers=ground_truths, predicted_answers=responses + ) + print(exact_match_result["result"]) From ecb0165921e537a6a2e6f3f135b4d3692554c516 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 4 Apr 2024 09:50:42 +0200 Subject: [PATCH 02/15] adding proposal in md --- proposals/text/0000-rag-evaluation.md | 225 +++++++++++++++++++++++ proposals/text/0000-rag_pipeline_eval.py | 179 ++++++++++++++++++ 2 files changed, 404 insertions(+) create mode 100644 proposals/text/0000-rag-evaluation.md create mode 100644 proposals/text/0000-rag_pipeline_eval.py diff --git a/proposals/text/0000-rag-evaluation.md b/proposals/text/0000-rag-evaluation.md new file mode 100644 index 0000000000..613143cd9d --- /dev/null +++ b/proposals/text/0000-rag-evaluation.md @@ -0,0 +1,225 @@ +- Title: Proposal for presentation of RAG evaluation results +- Decision driver: David S. Batista +- Start Date: 2024-04-03 +- Proposal PR: (fill in after opening the PR) +- Github Issue or Discussion: (only if available, link the original request for this change) + +# Summary + +Add a new component to Haystack allowing users to evaluate the performance of a RAG model. + +# Basic example + +```python + +data = { + "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], + "single_hit": [1, 1, 0, 1], + "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], + "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], + "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], + "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], + "exact_match": [0, 0, 0, 1] +} + +evaluation = RAGPipelineEvaluation(name="pipeline_1", data=data) +evaluation.classification_report() +``` + +```bash +{'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478, + 'Exact Match': 0.442} +```` + +# Motivation + +RAG models are one of them most popular use cases for Haystack. We are adding support for evaluations metrics, but +there is no way to present the results of the evaluation. + + +# Detailed design + +A new class `RAGPipelineEvaluation` that receives the results of some or all metric scores over a set of queries/questions +given to a RAG model. 
+ +Example: + +``` +data = { + "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], + "single_hit": [1, 1, 0, 1], + "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], + "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], + "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], + "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], + "exact_match": [0, 0, 0, 1] +} +``` + +These scores are computed using features already available in Haystack, in the `evaluators` module. + +`RAGPipelineEvaluation` stores internally this data as a `pd.DataFrame` and provides methods to operate on it presenting +results of the evaluation. + +```python +class RAGPipelineEvaluation: + + def __init__(self, name: str, data: Union[pd.DataFrame, List[Dict[str, Union[str, float]]]]): + self.name = name + self.data = self._get_mocked_dataframe_single_k_value(n_queries=50) # this is just to have numbers to show + + @staticmethod + def _get_mocked_dataframe_single_k_value(n_queries: int): + """ + Generate a mocked dataframe for evaluation purposes. + + - Reciprocal Rank: 1 / rank of the first correct answer - range [0, 1] + - Single Hit: 1 if the first retrieved document is correct, 0 otherwise - binary + - Multi Hit: proportion of correct documents in the top k retrieved documents - range [0,1] + + - Context Relevance: + for a given query q: + - the system first retrieves some context c(q) and then generates an answer a(q) + - the context relevance is the number of extracted sentences / number of sentences in the context c(q) + - [0,1] + + - Faithfulness: + - we say that the answer as(q) is faithful to the context c(q) if the claims that are made in the answer + can be inferred from the context. + - |V| number of statements that were supported according to the LLM + - |S| is the total number of statements. 
+ - Faithfulness = |V| / |S| + - [0,1] + + - Semantic Answer Similarity: cosine similarity between the generated answer and the correct answer - range [0,1] + - Exact Match: 1 if the generated answer is exactly the same as the correct answer, 0 otherwise - binary + """ + + columns = ['query_id', 'reciprocal rank', 'single hit', 'multi hit', 'context relevance', 'faithfulness', + 'semantic answer similarity', 'exact match'] + + query_id = [str(uuid.uuid4()) for _ in range(n_queries)] + reciprocal_rank = [random() for _ in range(n_queries)] + single_hit = [randint(0, 1) for _ in range(n_queries)] + multi_hit = [random() for _ in range(n_queries)] + context_relevance = [random() for _ in range(n_queries)] + faithfulness = [random() for _ in range(n_queries)] + semantic_similarity = [random() for _ in range(n_queries)] + exact_match = [randint(0, 1) for _ in range(n_queries)] + + values = list( + zip(query_id, reciprocal_rank, single_hit, multi_hit, context_relevance, + faithfulness, semantic_similarity, exact_match) + ) + + return pd.DataFrame(values, columns=columns) + + def evaluation_report(self) -> Dict[str, float]: + """Get the classification report for the different metrics""" + + mrr = self.get_aggregated_scores('reciprocal rank') + single_hit = self.get_aggregated_scores('single hit') + multi_hit = self.get_aggregated_scores('multi hit') + faithfulness = self.get_aggregated_scores('faithfulness') + context_relevance = self.get_aggregated_scores('context relevance') + semantic_similarity = self.get_aggregated_scores('semantic answer similarity') + exact_match = self.get_aggregated_scores('exact match') + correct_queries = self.data[self.data['exact match'] == 1].shape[0] + + return { + 'Reciprocal Rank': mrr, + 'Single Hit': single_hit, + 'Multi Hit': multi_hit, + 'Context Relevance': context_relevance, + 'Faithfulness': faithfulness, + 'Semantic Answer Similarity': semantic_similarity, + 'Exact Match': exact_match, + 'nr_correct_queries': correct_queries, + 'nr_incorrect_queries': self.data.shape[0] - correct_queries, + } + + def get_aggregated_scores(self, metric: str) -> float: + if metric in ['reciprocal rank', 'multi hit', 'context relevance', 'semantic answer similarity']: + return self.data[metric].mean() + if metric in ['single hit', 'exact match']: + return self.data[metric].sum() / len(self.data) + + def get_detailed_scores(self, metric: str, query_ids: List[str]) -> pd.DataFrame: + """Get the detailed scores for all queries or a for a subset of the queries for a given metric""" + pass + + def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: + """ + Use the `statistics` module to find the thresholds for the different metrics. 
+ + Some potentially interesting thresholds to find: + - the 25th percentile + - the 75th percentile + - the mean + - the median + """ + pass + + def get_scores_below_threshold(self, metric: str, threshold: float): + """Get the all the queries with a score below a certain threshold for a given metric""" + return self.data[self.data[metric] < threshold] + + def comparative_detailed_summary(self, other: "PipelineEvaluationAPI") -> pd.DataFrame: + """ + - Queries that are answered correctly by both pipelines + - Queries that are answered incorrectly by both pipelines + - Queries that are answered correctly by only one pipeline + """ + + # correct by both pipelines + both_correct = self.data[(self.data['exact match'] == 1) & (other.data['exact match'] == 1)]['query_id'].tolist() + + # incorrectly by both pipelines + both_incorrect = self.data[(self.data['exact match'] == 0) & (other.data['exact match'] == 0)]['query_id'].tolist() + + # queries that are answered correctly by only one pipeline + only_this_correct = self.data[(self.data['exact match'] == 1) & (other.data['exact match'] == 0)]['query_id'].tolist() + only_other_correct = self.data[(self.data['exact match'] == 0) & (other.data['exact match'] == 1)]['query_id'].tolist() + + columns = ["both_correct", "both_incorrect", f"only_{self.name}_correct", f"only_{other.name}_correct"] + + # make all lists the same length, fill with None, so that we can create a DataFrame + max_len = max(len(both_correct), len(both_incorrect)) + both_correct += ["None"] * (max_len - len(both_correct)) + both_incorrect += ["None"] * (max_len - len(both_incorrect)) + only_this_correct += ["None"] * (max_len - len(only_this_correct)) + only_other_correct += ["None"] * (max_len - len(only_other_correct)) + + values = list(zip(both_correct, both_incorrect, only_this_correct, only_other_correct)) + + return pd.DataFrame(values, columns=columns) + +``` + + +# Drawbacks + +- Relying on pandas DataFrame internally makes it easy to perform many of the operations. +- Nevertheless, it can be burden, since we are making `pandas` a dependency of `haystack-ai`. +- Ideally all the proposed methods should be implemented in a way that doesn't require `pandas`. + + +# Adoption strategy + +- Doesn't introduce any breaking change, it is a new feature that can be adopted by users as they see fit for their use cases. + +# How we teach this + +- A tutorial would be the best approach to teach users how to use this feature. +- Adding a new entry to the documentation. + +# Unresolved questions + +- The `comparative_summary()` and the `comparative_detailed_summary()` methods need to be adopted to different definitions of what a correct answer is. diff --git a/proposals/text/0000-rag_pipeline_eval.py b/proposals/text/0000-rag_pipeline_eval.py new file mode 100644 index 0000000000..ca891ab09a --- /dev/null +++ b/proposals/text/0000-rag_pipeline_eval.py @@ -0,0 +1,179 @@ +import uuid +from random import randint, random +from typing import Dict, List, Union + +import pandas as pd + +""" +This proposal is a good summary of the scenarios there are to evaluate RAG pipelines and the metrics involved. 
+ +Having the issue in mind https://github.com/deepset-ai/haystack/issues/7398, + +I was expecting this proposal to be more about output format (dataframe, csv?), + +what are columns (metrics?), +what is each row (one query?), + +will there be an evaluation report with worst and best predictions, + +and how will presentation of evaluation results differ from 1.x (https://docs.haystack.deepset.ai/v1.26-unstable/docs/evaluation). + +Including a short description of the metrics like here definitely helps discussing the proposal. +For metric descriptions, you can also refer to https://www.notion.so/deepsetai/Metrics-03d08dbafef24075924bb3b7b87aae5d. +""" + + +class RAGPipelineEvaluation: + def __init__(self, name: str, data: Union[pd.DataFrame, List[Dict[str, Union[str, float]]]]): + self.name = name + self.data = self._get_mocked_dataframe_single_k_value(n_queries=50) # this is just to have numbers to show + + @staticmethod + def _get_mocked_dataframe_single_k_value(n_queries: int): + """ + Generate a mocked dataframe for evaluation purposes. + + - Reciprocal Rank: 1 / rank of the first correct answer - range [0, 1] + - Single Hit: 1 if the first retrieved document is correct, 0 otherwise - binary + - Multi Hit: proportion of correct documents in the top k retrieved documents - range [0,1] + + - Context Relevance: + for a given query q: + - the system first retrieves some context c(q) and then generates an answer a(q) + - the context relevance is the number of extracted sentences / number of sentences in the context c(q) + - [0,1] + + - Faithfulness: + - we say that the answer as(q) is faithful to the context c(q) if the claims that are made in the answer + can be inferred from the context. + - |V| number of statements that were supported according to the LLM + - |S| is the total number of statements. 
+ - Faithfulness = |V| / |S| + - [0,1] + + - Semantic Answer Similarity: cosine similarity between the generated answer and the correct answer - range [0,1] + - Exact Match: 1 if the generated answer is exactly the same as the correct answer, 0 otherwise - binary + """ + + columns = [ + "query_id", + "reciprocal rank", + "single hit", + "multi hit", + "context relevance", + "faithfulness", + "semantic answer similarity", + "exact match", + ] + + query_id = [str(uuid.uuid4()) for _ in range(n_queries)] + reciprocal_rank = [random() for _ in range(n_queries)] + single_hit = [randint(0, 1) for _ in range(n_queries)] + multi_hit = [random() for _ in range(n_queries)] + context_relevance = [random() for _ in range(n_queries)] + faithfulness = [random() for _ in range(n_queries)] + semantic_similarity = [random() for _ in range(n_queries)] + exact_match = [randint(0, 1) for _ in range(n_queries)] + + values = list( + zip( + query_id, + reciprocal_rank, + single_hit, + multi_hit, + context_relevance, + faithfulness, + semantic_similarity, + exact_match, + ) + ) + + return pd.DataFrame(values, columns=columns) + + def evaluation_report(self) -> Dict[str, float]: + """Get the classification report for the different metrics""" + + mrr = self.get_aggregated_scores("reciprocal rank") + single_hit = self.get_aggregated_scores("single hit") + multi_hit = self.get_aggregated_scores("multi hit") + faithfulness = self.get_aggregated_scores("faithfulness") + context_relevance = self.get_aggregated_scores("context relevance") + semantic_similarity = self.get_aggregated_scores("semantic answer similarity") + exact_match = self.get_aggregated_scores("exact match") + correct_queries = self.data[self.data["exact match"] == 1].shape[0] + + return { + "Reciprocal Rank": mrr, + "Single Hit": single_hit, + "Multi Hit": multi_hit, + "Context Relevance": context_relevance, + "Faithfulness": faithfulness, + "Semantic Answer Similarity": semantic_similarity, + "Exact Match": exact_match, + "nr_correct_queries": correct_queries, + "nr_incorrect_queries": self.data.shape[0] - correct_queries, + } + + def get_aggregated_scores(self, metric: str) -> float: + if metric in ["reciprocal rank", "multi hit", "context relevance", "semantic answer similarity"]: + return self.data[metric].mean() + if metric in ["single hit", "exact match"]: + return self.data[metric].sum() / len(self.data) + + def get_detailed_scores(self, metric: str, query_ids: List[str]) -> pd.DataFrame: + """Get the detailed scores for all queries or a for a subset of the queries for a given metric""" + pass + + def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: + """ + Use the `statistics` module to find the thresholds for the different metrics. 
+ + Some potentially interesting thresholds to find: + - the 25th percentile + - the 75th percentile + - the mean + - the median + """ + pass + + def get_scores_below_threshold(self, metric: str, threshold: float): + """Get the all the queries with a score below a certain threshold for a given metric""" + return self.data[self.data[metric] < threshold] + + def comparative_detailed_summary(self, other: "RAGPipelineEvaluation") -> pd.DataFrame: + """ + - Queries that are answered correctly by both pipelines + - Queries that are answered incorrectly by both pipelines + - Queries that are answered correctly by only one pipeline + """ + + # correct by both pipelines + both_correct = self.data[(self.data["exact match"] == 1) & (other.data["exact match"] == 1)][ + "query_id" + ].tolist() + + # incorrectly by both pipelines + both_incorrect = self.data[(self.data["exact match"] == 0) & (other.data["exact match"] == 0)][ + "query_id" + ].tolist() + + # queries that are answered correctly by only one pipeline + only_this_correct = self.data[(self.data["exact match"] == 1) & (other.data["exact match"] == 0)][ + "query_id" + ].tolist() + only_other_correct = self.data[(self.data["exact match"] == 0) & (other.data["exact match"] == 1)][ + "query_id" + ].tolist() + + columns = ["both_correct", "both_incorrect", f"only_{self.name}_correct", f"only_{other.name}_correct"] + + # make all lists the same length, fill with None, so that we can create a DataFrame + max_len = max(len(both_correct), len(both_incorrect)) + both_correct += ["None"] * (max_len - len(both_correct)) + both_incorrect += ["None"] * (max_len - len(both_incorrect)) + only_this_correct += ["None"] * (max_len - len(only_this_correct)) + only_other_correct += ["None"] * (max_len - len(only_other_correct)) + + values = list(zip(both_correct, both_incorrect, only_this_correct, only_other_correct)) + + return pd.DataFrame(values, columns=columns) From 71207984f203764c5877deb0a343d2690d87df4c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 4 Apr 2024 09:53:19 +0200 Subject: [PATCH 03/15] renaming proposal number --- proposals/text/{0000-rag-evaluation.md => 7462-rag-evaluation.md} | 0 .../text/{0000-rag_pipeline_eval.py => 7462-rag_pipeline_eval.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename proposals/text/{0000-rag-evaluation.md => 7462-rag-evaluation.md} (100%) rename proposals/text/{0000-rag_pipeline_eval.py => 7462-rag_pipeline_eval.py} (100%) diff --git a/proposals/text/0000-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md similarity index 100% rename from proposals/text/0000-rag-evaluation.md rename to proposals/text/7462-rag-evaluation.md diff --git a/proposals/text/0000-rag_pipeline_eval.py b/proposals/text/7462-rag_pipeline_eval.py similarity index 100% rename from proposals/text/0000-rag_pipeline_eval.py rename to proposals/text/7462-rag_pipeline_eval.py From 8d476751bb155cadc845552c2ffada6028f554a3 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 4 Apr 2024 09:55:12 +0200 Subject: [PATCH 04/15] removing stuff --- examples/rag_eval_squad.py | 47 -------- examples/run_evaluation.py | 219 ------------------------------------- 2 files changed, 266 deletions(-) delete mode 100644 examples/rag_eval_squad.py delete mode 100644 examples/run_evaluation.py diff --git a/examples/rag_eval_squad.py b/examples/rag_eval_squad.py deleted file mode 100644 index 3483eabe51..0000000000 --- a/examples/rag_eval_squad.py +++ /dev/null @@ -1,47 +0,0 @@ -import json -from collections import defaultdict -from pathlib import Path - -from datasets import load_dataset -from tqdm import tqdm - - -def aggregate_wiki_title(data, agg_wiki_title): - for idx, x in enumerate(data.iter(batch_size=1)): - if x["context"] not in agg_wiki_title[x["title"][0]]["context"]: - agg_wiki_title[x["title"][0]]["context"].append(x["context"]) - agg_wiki_title[x["title"][0]]["question_answers"].append({"question": x["question"], "answers": x["answers"]}) - - -def main(): - data_train = load_dataset("squad", split="train") - data_validation = load_dataset("squad", split="validation") - agg_wiki_title = defaultdict(lambda: {"context": [], "question_answers": [], "text": ""}) - aggregate_wiki_title(data_train, agg_wiki_title) - aggregate_wiki_title(data_validation, agg_wiki_title) - - # merge the context into a single document - for article in tqdm(agg_wiki_title.keys()): - agg_wiki_title[article]["text"] = "\n".join([x[0] for x in agg_wiki_title[article]["context"]]) - - # create documents - for article in agg_wiki_title.keys(): - out_path = Path("transformed_squad/articles/") - out_path.mkdir(parents=True, exist_ok=True) - with open(f"{str(out_path)}/{article}.txt", "w") as f: - f.write(agg_wiki_title[article]["text"]) - - # create question/answers - questions = Path("transformed_squad/") - questions.mkdir(parents=True, exist_ok=True) - with open(f"{str(questions)}/questions.jsonl", "w") as f: - for article in agg_wiki_title.keys(): - for entry in agg_wiki_title[article]["question_answers"]: - f.write( - json.dumps({"question": entry["question"][0], "document": article, "answers": entry["answers"][0]}) - + "\n" - ) - - -if __name__ == "__main__": - main() diff --git a/examples/run_evaluation.py b/examples/run_evaluation.py deleted file mode 100644 index dd9136419b..0000000000 --- a/examples/run_evaluation.py +++ /dev/null @@ -1,219 +0,0 @@ -import json -import os -import random -from typing import List - -from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric - -from haystack import Document, Pipeline -from haystack.components.builders import AnswerBuilder, PromptBuilder -from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder -from haystack.components.evaluators import AnswerExactMatchEvaluator -from haystack.components.generators import OpenAIGenerator -from haystack.components.retrievers import InMemoryEmbeddingRetriever -from haystack.components.retrievers.in_memory import InMemoryBM25Retriever -from haystack.components.writers import DocumentWriter -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.document_stores.types import DuplicatePolicy - - -def load_transformed_squad(): - with open("transformed_squad/questions.jsonl", "r") as f: - questions = [json.loads(x) for x in f.readlines()] - - def create_document(text: str, name: str): - return Document(content=text, meta={"name": name}) - - # walk through the files in the directory 
and transform each text file into a Document - documents = [] - for root, dirs, files in os.walk("transformed_squad/articles/"): - for article in files: - with open(f"{root}/{article}", "r") as f: - raw_texts = f.read().split("\n") - for text in raw_texts: - documents.append(create_document(text, article.replace(".txt", ""))) - - return questions, documents - - -def indexing(documents: List[Document]): - document_store = InMemoryDocumentStore() - - doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) - doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") - - ingestion_pipe = Pipeline() - ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") - ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") - - ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents") - ingestion_pipe.run({"doc_embedder": {"documents": documents}}) - - return document_store - - -def run_evaluation(): - template = """ - Given the following information, answer the question. - - Context: - {% for document in documents %} - {{ document.content }} - {% endfor %} - - Question: {{question}} - Answer: - """ - - questions, documents = load_transformed_squad() - document_store = indexing(documents) - - rag_pipeline_1 = Pipeline() - rag_pipeline_1.add_component( - "query_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") - ) - rag_pipeline_1.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=2)) - rag_pipeline_1.add_component("prompt_builder", PromptBuilder(template=template)) - rag_pipeline_1.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) - rag_pipeline_1.add_component("answer_builder", AnswerBuilder()) - - rag_pipeline_1.connect("query_embedder", "retriever.query_embedding") - rag_pipeline_1.connect("retriever", "prompt_builder.documents") - rag_pipeline_1.connect("prompt_builder", "llm") - rag_pipeline_1.connect("llm.replies", "answer_builder.replies") - rag_pipeline_1.connect("llm.meta", "answer_builder.meta") - rag_pipeline_1.connect("retriever", "answer_builder.documents") - - # select 5 random questions from the list of questions - for random_questions in random.sample(questions, 5): - question = random_questions["question"] - answer = random_questions["answers"]["text"] - print(f"Question: {question}") - print(f"Answer: {answer}") - print() - response = rag_pipeline_1.run( - { - "query_embedder": {"text": question}, - "prompt_builder": {"question": question}, - "answer_builder": {"query": question}, - } - ) - print("Answer from pipeline:") - print(response["answer_builder"]["answers"][0].data) - print("\n") - - -def seven_wonders(): - template = """ - Given the following information, answer the question. 
- - Context: - {% for document in documents %} - {{ document.content }} - {% endfor %} - - Question: {{question}} - Answer: - """ - - questions, documents = load_transformed_squad() - document_store = InMemoryDocumentStore() - document_store.write_documents(documents) - - rag_pipeline_1 = Pipeline() - rag_pipeline_1.add_component("retriever", InMemoryBM25Retriever(document_store, top_k=10)) - rag_pipeline_1.add_component("prompt_builder", PromptBuilder(template=template)) - rag_pipeline_1.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) - rag_pipeline_1.add_component(instance=AnswerBuilder(), name="answer_builder") - rag_pipeline_1.connect("retriever", "prompt_builder.documents") - rag_pipeline_1.connect("prompt_builder", "llm") - rag_pipeline_1.connect("llm.replies", "answer_builder.replies") - rag_pipeline_1.connect("llm.meta", "answer_builder.meta") - rag_pipeline_1.connect("retriever", "answer_builder.documents") - - document_store = InMemoryDocumentStore() - document_store.write_documents(documents) - rag_pipeline_2 = Pipeline() - rag_pipeline_2.add_component("retriever", InMemoryBM25Retriever(document_store, top_k=1)) - rag_pipeline_2.add_component("prompt_builder", PromptBuilder(template=template)) - rag_pipeline_2.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) - rag_pipeline_2.add_component(instance=AnswerBuilder(), name="answer_builder") - rag_pipeline_2.connect("retriever", "prompt_builder.documents") - rag_pipeline_2.connect("prompt_builder", "llm") - rag_pipeline_2.connect("llm.replies", "answer_builder.replies") - rag_pipeline_2.connect("llm.meta", "answer_builder.meta") - rag_pipeline_2.connect("retriever", "answer_builder.documents") - - question = "What does Rhodes Statue look like?" - for pipeline in [rag_pipeline_1, rag_pipeline_2]: - response = pipeline.run( - { - "retriever": {"query": question}, - "prompt_builder": {"question": question}, - "answer_builder": {"query": question}, - } - ) - print(response["answer_builder"]["answers"][0]) - - evaluator_exact_match = AnswerExactMatchEvaluator() - - eval_pipeline = Pipeline() - evaluator_context = RagasEvaluator(metric=RagasMetric.CONTEXT_PRECISION) - evaluator_aspect = RagasEvaluator( - metric=RagasMetric.ASPECT_CRITIQUE, - metric_params={"name": "custom", "definition": "Is this answer problematic for children?", "strictness": 3}, - ) - eval_pipeline.add_component("evaluator_context", evaluator_context) - eval_pipeline.add_component("evaluator_aspect", evaluator_aspect) - eval_pipeline.add_component("evaluator_exact_match", evaluator_exact_match) - - questions = [ - "Where is Gardens of Babylon?", - "Why did people build Great Pyramid of Giza?", - # "What does Rhodes Statue look like?", - # "Why did people visit the Temple of Artemis?", - # "What is the importance of Colossus of Rhodes?", - # "What happened to the Tomb of Mausolus?", - # "How did Colossus of Rhodes collapse?", - ] - ground_truths = [ - "The gardens of Babylon were in the ancient city of Babylon, near present-day Hillah, Babil province, in Iraq.", - "The pyramids of Giza were constructed to honor the pharaoh and to serve as his tomb after death.", - ] - - for pipeline in [rag_pipeline_1, rag_pipeline_2]: - contexts = [] - responses = [] - for question in questions: - results = pipeline.run( - { - "retriever": {"query": question}, - "prompt_builder": {"question": question}, - "answer_builder": {"query": question}, - } - ) - - context = [doc.content for doc in results["answer_builder"]["answers"][0].documents] - response = 
results["answer_builder"]["answers"][0].data - contexts.append(context) - responses.append(response) - - results = eval_pipeline.run( - { - "evaluator_context": {"questions": questions, "contexts": contexts, "ground_truths": ground_truths}, - "evaluator_aspect": {"questions": questions, "contexts": contexts, "responses": responses}, - "evaluator_exact_match": { - "questions": questions, - "ground_truth_answers": ground_truths, - "predicted_answers": responses, - }, - } - ) - print(results) - - # Users can also run evaluator components individually outside of a pipeline - evaluator = AnswerExactMatchEvaluator() - exact_match_result = evaluator.run( - questions=questions, ground_truth_answers=ground_truths, predicted_answers=responses - ) - print(exact_match_result["result"]) From 93229d0ec71c111ef821c0319eb218894d5178a7 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 4 Apr 2024 12:37:51 +0200 Subject: [PATCH 05/15] cleaning up --- proposals/text/7462-rag_pipeline_eval.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/proposals/text/7462-rag_pipeline_eval.py b/proposals/text/7462-rag_pipeline_eval.py index ca891ab09a..bf58f22528 100644 --- a/proposals/text/7462-rag_pipeline_eval.py +++ b/proposals/text/7462-rag_pipeline_eval.py @@ -4,24 +4,6 @@ import pandas as pd -""" -This proposal is a good summary of the scenarios there are to evaluate RAG pipelines and the metrics involved. - -Having the issue in mind https://github.com/deepset-ai/haystack/issues/7398, - -I was expecting this proposal to be more about output format (dataframe, csv?), - -what are columns (metrics?), -what is each row (one query?), - -will there be an evaluation report with worst and best predictions, - -and how will presentation of evaluation results differ from 1.x (https://docs.haystack.deepset.ai/v1.26-unstable/docs/evaluation). - -Including a short description of the metrics like here definitely helps discussing the proposal. -For metric descriptions, you can also refer to https://www.notion.so/deepsetai/Metrics-03d08dbafef24075924bb3b7b87aae5d. -""" - class RAGPipelineEvaluation: def __init__(self, name: str, data: Union[pd.DataFrame, List[Dict[str, Union[str, float]]]]): From 32de96b7a42cb84b8853a13327bbb93d53b02618 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 5 Apr 2024 15:59:25 +0200 Subject: [PATCH 06/15] adding PR number and issue --- proposals/text/7462-rag-evaluation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 613143cd9d..0613468d2a 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -1,8 +1,8 @@ - Title: Proposal for presentation of RAG evaluation results - Decision driver: David S. Batista - Start Date: 2024-04-03 -- Proposal PR: (fill in after opening the PR) -- Github Issue or Discussion: (only if available, link the original request for this change) +- Proposal PR: #7462 +- Github Issue or Discussion: https://github.com/deepset-ai/haystack/issues/7398 # Summary From e1aa59df790427687ddfc3283d56fd4146c29601 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 8 Apr 2024 14:41:34 +0200 Subject: [PATCH 07/15] updating proposal --- proposals/text/7462-rag-evaluation.md | 333 ++++++++++++----------- proposals/text/7462-rag_pipeline_eval.py | 161 ----------- 2 files changed, 172 insertions(+), 322 deletions(-) delete mode 100644 proposals/text/7462-rag_pipeline_eval.py diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 0613468d2a..fb5a4b8e31 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -6,12 +6,90 @@ # Summary -Add a new component to Haystack allowing users to evaluate the performance of a RAG model. +Add a new component to Haystack allowing users interact with the results of evaluating the performance of a RAG model. -# Basic example + +# Motivation + +RAG models are one of them most popular use cases for Haystack. We are adding support for evaluations metrics, but there is no way to present the results of the evaluation. + + +# Detailed design + +The output results of an evaluation pipeline composed of `evaluator` components are passed to a `RAGPipelineEvaluation` +(this is a placeholder name) which stores them internally and acts as an interface to access and present the results. + +Example of the data structure that the `RAGPipelineEvaluation` class will receive for initialization: + +```python + +data = { + "queries": { + "query_id": ["53c3b3e6", "225f87f7"], + "question": ["What is the capital of France?", "What is the capital of Spain?"], + "contexts": ["wiki_France", "wiki_Spain"], + "answer": ["Paris", "Madrid"] + }, + "metrics": + [ + {"name": "reciprocal_rank", "scores": [0.378064, 0.534964, 0.216058, 0.778642]}, + {"name": "single_hit", "scores": [1, 1, 0, 1]}, + {"name": "multi_hit", "scores": [0.706125, 0.454976, 0.445512, 0.250522]}, + {"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]}, + {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]}, + {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]} + ] + } +``` + +- At least the `query_id` or the `question and context` should be present in the data structure. +- At least one of the metrics should be present in the data structure. + + +The `RAGPipelineEvaluation` class provides the following methods to different types of users: + +Basic users: +- `evaluation_report()` +- `comparative_evaluation_summary()` + +Intermediate users: +- `detailed_evaluation_report()` +- `comparative_detailed_evaluation_report()` + +Advanced users: +- `find_thresholds()` +- `find_scores_below_threshold()` + + +### Methods description +An evaluation report that provides a summary of the performance of the model across all queries, showing the +aggregated scores for all available metrics. + +```python +def evaluation_report(): +``` + +Example output + +```bash +{'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } + ``` + +A detailed evaluation report that provides the scores of all available metrics for all queries or a subset of queries. 
```python +def get_detailed_scores(query_ids: Union[List[str], str] = "all"): +``` + +Example output +```bash data = { "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], @@ -20,187 +98,120 @@ data = { "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], - "exact_match": [0, 0, 0, 1] + "aggregated_score": + { + 'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } } +``` -evaluation = RAGPipelineEvaluation(name="pipeline_1", data=data) -evaluation.classification_report() +### Comparative Evaluation Report + +A comparative summary that compares the performance of the model with another model based on the aggregated scores +for all available metrics. + +```python +def comparative_summary(self, other: "RAGPipelineEvaluation"): ``` ```bash -{'Reciprocal Rank': 0.448, - 'Single Hit': 0.5, - 'Multi Hit': 0.540, - 'Context Relevance': 0.537, - 'Faithfulness': 0.452, - 'Semantic Answer Similarity': 0.478, - 'Exact Match': 0.442} -```` +{ + "model_1": { + 'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + }, + "model_2": { + 'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } +} -# Motivation +``` -RAG models are one of them most popular use cases for Haystack. We are adding support for evaluations metrics, but -there is no way to present the results of the evaluation. +A detailed comparative summary that compares the performance of the model with another model based on the scores of all +available metrics for all queries. -# Detailed design +```python +def detailed_comparative_summary(self, other: "RAGPipelineEvaluation"): +``` -A new class `RAGPipelineEvaluation` that receives the results of some or all metric scores over a set of queries/questions -given to a RAG model. 
+```bash +{ + "pipeline_1": { + "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], + "single_hit": [1, 1, 0, 1], + "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], + "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], + "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], + "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], + "aggregated_score": + { + 'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } + }, + "pipeline_2": { + "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], + "single_hit": [1, 1, 0, 1], + "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], + "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], + "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], + "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], + "aggregated_score": + { + 'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } + } +} +``` -Example: +Have a method to find interesting scores thresholds, typically used for error analysis, for all metrics available. +Some potentially interesting thresholds to find are: the 25th percentile, the 75th percentile, the mean , the median. +```python +def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: ``` + +```bash data = { - "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "thresholds": ["25th percentile", "75th percentile", "mean", "average"], "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], - "single_hit": [1, 1, 0, 1], - "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], - "exact_match": [0, 0, 0, 1] } -``` - -These scores are computed using features already available in Haystack, in the `evaluators` module. +```` -`RAGPipelineEvaluation` stores internally this data as a `pd.DataFrame` and provides methods to operate on it presenting -results of the evaluation. +Then have another method that ```python -class RAGPipelineEvaluation: - - def __init__(self, name: str, data: Union[pd.DataFrame, List[Dict[str, Union[str, float]]]]): - self.name = name - self.data = self._get_mocked_dataframe_single_k_value(n_queries=50) # this is just to have numbers to show - - @staticmethod - def _get_mocked_dataframe_single_k_value(n_queries: int): - """ - Generate a mocked dataframe for evaluation purposes. - - - Reciprocal Rank: 1 / rank of the first correct answer - range [0, 1] - - Single Hit: 1 if the first retrieved document is correct, 0 otherwise - binary - - Multi Hit: proportion of correct documents in the top k retrieved documents - range [0,1] - - - Context Relevance: - for a given query q: - - the system first retrieves some context c(q) and then generates an answer a(q) - - the context relevance is the number of extracted sentences / number of sentences in the context c(q) - - [0,1] - - - Faithfulness: - - we say that the answer as(q) is faithful to the context c(q) if the claims that are made in the answer - can be inferred from the context. 
- - |V| number of statements that were supported according to the LLM - - |S| is the total number of statements. - - Faithfulness = |V| / |S| - - [0,1] - - - Semantic Answer Similarity: cosine similarity between the generated answer and the correct answer - range [0,1] - - Exact Match: 1 if the generated answer is exactly the same as the correct answer, 0 otherwise - binary - """ - - columns = ['query_id', 'reciprocal rank', 'single hit', 'multi hit', 'context relevance', 'faithfulness', - 'semantic answer similarity', 'exact match'] - - query_id = [str(uuid.uuid4()) for _ in range(n_queries)] - reciprocal_rank = [random() for _ in range(n_queries)] - single_hit = [randint(0, 1) for _ in range(n_queries)] - multi_hit = [random() for _ in range(n_queries)] - context_relevance = [random() for _ in range(n_queries)] - faithfulness = [random() for _ in range(n_queries)] - semantic_similarity = [random() for _ in range(n_queries)] - exact_match = [randint(0, 1) for _ in range(n_queries)] - - values = list( - zip(query_id, reciprocal_rank, single_hit, multi_hit, context_relevance, - faithfulness, semantic_similarity, exact_match) - ) - - return pd.DataFrame(values, columns=columns) - - def evaluation_report(self) -> Dict[str, float]: - """Get the classification report for the different metrics""" - - mrr = self.get_aggregated_scores('reciprocal rank') - single_hit = self.get_aggregated_scores('single hit') - multi_hit = self.get_aggregated_scores('multi hit') - faithfulness = self.get_aggregated_scores('faithfulness') - context_relevance = self.get_aggregated_scores('context relevance') - semantic_similarity = self.get_aggregated_scores('semantic answer similarity') - exact_match = self.get_aggregated_scores('exact match') - correct_queries = self.data[self.data['exact match'] == 1].shape[0] - - return { - 'Reciprocal Rank': mrr, - 'Single Hit': single_hit, - 'Multi Hit': multi_hit, - 'Context Relevance': context_relevance, - 'Faithfulness': faithfulness, - 'Semantic Answer Similarity': semantic_similarity, - 'Exact Match': exact_match, - 'nr_correct_queries': correct_queries, - 'nr_incorrect_queries': self.data.shape[0] - correct_queries, - } - - def get_aggregated_scores(self, metric: str) -> float: - if metric in ['reciprocal rank', 'multi hit', 'context relevance', 'semantic answer similarity']: - return self.data[metric].mean() - if metric in ['single hit', 'exact match']: - return self.data[metric].sum() / len(self.data) - - def get_detailed_scores(self, metric: str, query_ids: List[str]) -> pd.DataFrame: - """Get the detailed scores for all queries or a for a subset of the queries for a given metric""" - pass - - def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: - """ - Use the `statistics` module to find the thresholds for the different metrics. 
- - Some potentially interesting thresholds to find: - - the 25th percentile - - the 75th percentile - - the mean - - the median - """ - pass - - def get_scores_below_threshold(self, metric: str, threshold: float): - """Get the all the queries with a score below a certain threshold for a given metric""" - return self.data[self.data[metric] < threshold] - - def comparative_detailed_summary(self, other: "PipelineEvaluationAPI") -> pd.DataFrame: - """ - - Queries that are answered correctly by both pipelines - - Queries that are answered incorrectly by both pipelines - - Queries that are answered correctly by only one pipeline - """ - - # correct by both pipelines - both_correct = self.data[(self.data['exact match'] == 1) & (other.data['exact match'] == 1)]['query_id'].tolist() - - # incorrectly by both pipelines - both_incorrect = self.data[(self.data['exact match'] == 0) & (other.data['exact match'] == 0)]['query_id'].tolist() - - # queries that are answered correctly by only one pipeline - only_this_correct = self.data[(self.data['exact match'] == 1) & (other.data['exact match'] == 0)]['query_id'].tolist() - only_other_correct = self.data[(self.data['exact match'] == 0) & (other.data['exact match'] == 1)]['query_id'].tolist() - - columns = ["both_correct", "both_incorrect", f"only_{self.name}_correct", f"only_{other.name}_correct"] - - # make all lists the same length, fill with None, so that we can create a DataFrame - max_len = max(len(both_correct), len(both_incorrect)) - both_correct += ["None"] * (max_len - len(both_correct)) - both_incorrect += ["None"] * (max_len - len(both_incorrect)) - only_this_correct += ["None"] * (max_len - len(only_this_correct)) - only_other_correct += ["None"] * (max_len - len(only_other_correct)) - - values = list(zip(both_correct, both_incorrect, only_this_correct, only_other_correct)) - - return pd.DataFrame(values, columns=columns) - +def get_scores_below_threshold(self, metric: str, threshold: float): + """Get the all the queries with a score below a certain threshold for a given metric""" ``` diff --git a/proposals/text/7462-rag_pipeline_eval.py b/proposals/text/7462-rag_pipeline_eval.py deleted file mode 100644 index bf58f22528..0000000000 --- a/proposals/text/7462-rag_pipeline_eval.py +++ /dev/null @@ -1,161 +0,0 @@ -import uuid -from random import randint, random -from typing import Dict, List, Union - -import pandas as pd - - -class RAGPipelineEvaluation: - def __init__(self, name: str, data: Union[pd.DataFrame, List[Dict[str, Union[str, float]]]]): - self.name = name - self.data = self._get_mocked_dataframe_single_k_value(n_queries=50) # this is just to have numbers to show - - @staticmethod - def _get_mocked_dataframe_single_k_value(n_queries: int): - """ - Generate a mocked dataframe for evaluation purposes. - - - Reciprocal Rank: 1 / rank of the first correct answer - range [0, 1] - - Single Hit: 1 if the first retrieved document is correct, 0 otherwise - binary - - Multi Hit: proportion of correct documents in the top k retrieved documents - range [0,1] - - - Context Relevance: - for a given query q: - - the system first retrieves some context c(q) and then generates an answer a(q) - - the context relevance is the number of extracted sentences / number of sentences in the context c(q) - - [0,1] - - - Faithfulness: - - we say that the answer as(q) is faithful to the context c(q) if the claims that are made in the answer - can be inferred from the context. 
- - |V| number of statements that were supported according to the LLM - - |S| is the total number of statements. - - Faithfulness = |V| / |S| - - [0,1] - - - Semantic Answer Similarity: cosine similarity between the generated answer and the correct answer - range [0,1] - - Exact Match: 1 if the generated answer is exactly the same as the correct answer, 0 otherwise - binary - """ - - columns = [ - "query_id", - "reciprocal rank", - "single hit", - "multi hit", - "context relevance", - "faithfulness", - "semantic answer similarity", - "exact match", - ] - - query_id = [str(uuid.uuid4()) for _ in range(n_queries)] - reciprocal_rank = [random() for _ in range(n_queries)] - single_hit = [randint(0, 1) for _ in range(n_queries)] - multi_hit = [random() for _ in range(n_queries)] - context_relevance = [random() for _ in range(n_queries)] - faithfulness = [random() for _ in range(n_queries)] - semantic_similarity = [random() for _ in range(n_queries)] - exact_match = [randint(0, 1) for _ in range(n_queries)] - - values = list( - zip( - query_id, - reciprocal_rank, - single_hit, - multi_hit, - context_relevance, - faithfulness, - semantic_similarity, - exact_match, - ) - ) - - return pd.DataFrame(values, columns=columns) - - def evaluation_report(self) -> Dict[str, float]: - """Get the classification report for the different metrics""" - - mrr = self.get_aggregated_scores("reciprocal rank") - single_hit = self.get_aggregated_scores("single hit") - multi_hit = self.get_aggregated_scores("multi hit") - faithfulness = self.get_aggregated_scores("faithfulness") - context_relevance = self.get_aggregated_scores("context relevance") - semantic_similarity = self.get_aggregated_scores("semantic answer similarity") - exact_match = self.get_aggregated_scores("exact match") - correct_queries = self.data[self.data["exact match"] == 1].shape[0] - - return { - "Reciprocal Rank": mrr, - "Single Hit": single_hit, - "Multi Hit": multi_hit, - "Context Relevance": context_relevance, - "Faithfulness": faithfulness, - "Semantic Answer Similarity": semantic_similarity, - "Exact Match": exact_match, - "nr_correct_queries": correct_queries, - "nr_incorrect_queries": self.data.shape[0] - correct_queries, - } - - def get_aggregated_scores(self, metric: str) -> float: - if metric in ["reciprocal rank", "multi hit", "context relevance", "semantic answer similarity"]: - return self.data[metric].mean() - if metric in ["single hit", "exact match"]: - return self.data[metric].sum() / len(self.data) - - def get_detailed_scores(self, metric: str, query_ids: List[str]) -> pd.DataFrame: - """Get the detailed scores for all queries or a for a subset of the queries for a given metric""" - pass - - def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: - """ - Use the `statistics` module to find the thresholds for the different metrics. 
- - Some potentially interesting thresholds to find: - - the 25th percentile - - the 75th percentile - - the mean - - the median - """ - pass - - def get_scores_below_threshold(self, metric: str, threshold: float): - """Get the all the queries with a score below a certain threshold for a given metric""" - return self.data[self.data[metric] < threshold] - - def comparative_detailed_summary(self, other: "RAGPipelineEvaluation") -> pd.DataFrame: - """ - - Queries that are answered correctly by both pipelines - - Queries that are answered incorrectly by both pipelines - - Queries that are answered correctly by only one pipeline - """ - - # correct by both pipelines - both_correct = self.data[(self.data["exact match"] == 1) & (other.data["exact match"] == 1)][ - "query_id" - ].tolist() - - # incorrectly by both pipelines - both_incorrect = self.data[(self.data["exact match"] == 0) & (other.data["exact match"] == 0)][ - "query_id" - ].tolist() - - # queries that are answered correctly by only one pipeline - only_this_correct = self.data[(self.data["exact match"] == 1) & (other.data["exact match"] == 0)][ - "query_id" - ].tolist() - only_other_correct = self.data[(self.data["exact match"] == 0) & (other.data["exact match"] == 1)][ - "query_id" - ].tolist() - - columns = ["both_correct", "both_incorrect", f"only_{self.name}_correct", f"only_{other.name}_correct"] - - # make all lists the same length, fill with None, so that we can create a DataFrame - max_len = max(len(both_correct), len(both_incorrect)) - both_correct += ["None"] * (max_len - len(both_correct)) - both_incorrect += ["None"] * (max_len - len(both_incorrect)) - only_this_correct += ["None"] * (max_len - len(only_this_correct)) - only_other_correct += ["None"] * (max_len - len(only_other_correct)) - - values = list(zip(both_correct, both_incorrect, only_this_correct, only_other_correct)) - - return pd.DataFrame(values, columns=columns) From e8f765e5d1fe719c1d2447b2f04220d487708b11 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 14:58:58 +0200 Subject: [PATCH 08/15] updating proposal --- proposals/text/7462-rag-evaluation.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index fb5a4b8e31..c32c1789df 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -38,12 +38,15 @@ data = { {"name": "context_relevance", "scores": [0.805466, 0.410251, 0.750070, 0.361332]}, {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]}, {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]} - ] - } + ], + "pipeline_answers": ["Paris", "Madrid"] + }, + ``` - At least the `query_id` or the `question and context` should be present in the data structure. - At least one of the metrics should be present in the data structure. +- The `pipeline_answers` field is optional, it is used to compare the answers generated by the pipeline with the expected answers. 
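For illustration, here is a minimal sketch of how a payload in this shape might be assembled and checked against the two requirements listed above. It is only a sketch: the values are abridged from the example, and the commented-out constructor call mirrors the prototype `RAGPipelineEvaluation(name, data)` class included in this proposal rather than a final API.

```python
# Minimal sketch (values abridged from the example above). The constructor call at
# the end is hypothetical: it mirrors the prototype RAGPipelineEvaluation(name, data)
# class in this proposal and may still change.
data = {
    "queries": {
        "query_id": ["53c3b3e6", "225f87f7"],
        "question": ["What is the capital of France?", "What is the capital of Spain?"],
        "contexts": ["wiki_France", "wiki_Spain"],
        "answer": ["Paris", "Madrid"],
    },
    "metrics": [
        {"name": "reciprocal_rank", "scores": [0.378064, 0.534964]},
        {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320]},
    ],
    "pipeline_answers": ["Paris", "Madrid"],  # optional
}

# At least `query_id` or `question`/`contexts` must be present ...
queries = data["queries"]
assert "query_id" in queries or ("question" in queries and "contexts" in queries)
# ... and at least one metric must be present.
assert len(data["metrics"]) >= 1

# Intended usage, based on the prototype constructor:
# evaluation = RAGPipelineEvaluation(name="pipeline_1", data=data)
```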
The `RAGPipelineEvaluation` class provides the following methods to different types of users: @@ -151,8 +154,13 @@ def detailed_comparative_summary(self, other: "RAGPipelineEvaluation"): ```bash { - "pipeline_1": { - "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "queries": { + "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], + "question": ["What is the capital of France?", "What is the capital of Spain?"], + "contexts": ["wiki_France", "wiki_Spain"], + "answer": ["Paris", "Madrid"] + } + "pipeline_1": { "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], "single_hit": [1, 1, 0, 1], "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], @@ -170,7 +178,6 @@ def detailed_comparative_summary(self, other: "RAGPipelineEvaluation"): } }, "pipeline_2": { - "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], "single_hit": [1, 1, 0, 1], "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], From 62583183c0752fc58a48ff6759fd9fe44bb2e44e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 16:04:05 +0200 Subject: [PATCH 09/15] Update proposals/text/7462-rag-evaluation.md Co-authored-by: Madeesh Kannan --- proposals/text/7462-rag-evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index c32c1789df..8d5dd93b5f 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -1,4 +1,4 @@ -- Title: Proposal for presentation of RAG evaluation results +- Title: Proposal for presentation of evaluation results - Decision driver: David S. Batista - Start Date: 2024-04-03 - Proposal PR: #7462 From 7996dbb31522a3e0ac1b6298dc9a7d6180a10207 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 16:00:36 +0200 Subject: [PATCH 10/15] changing name --- proposals/text/7462-rag-evaluation.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 8d5dd93b5f..7821d9a37a 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -16,10 +16,10 @@ RAG models are one of them most popular use cases for Haystack. We are adding su # Detailed design -The output results of an evaluation pipeline composed of `evaluator` components are passed to a `RAGPipelineEvaluation` +The output results of an evaluation pipeline composed of `evaluator` components are passed to a `EvaluationResults` (this is a placeholder name) which stores them internally and acts as an interface to access and present the results. -Example of the data structure that the `RAGPipelineEvaluation` class will receive for initialization: +Example of the data structure that the `EvaluationResults` class will receive for initialization: ```python @@ -49,7 +49,7 @@ data = { - The `pipeline_answers` field is optional, it is used to compare the answers generated by the pipeline with the expected answers. -The `RAGPipelineEvaluation` class provides the following methods to different types of users: +The `EvaluationResults` class provides the following methods to different types of users: Basic users: - `evaluation_report()` @@ -119,7 +119,7 @@ A comparative summary that compares the performance of the model with another mo for all available metrics. 
```python -def comparative_summary(self, other: "RAGPipelineEvaluation"): +def comparative_summary(self, other: "EvaluationResults"): ``` ```bash @@ -149,7 +149,7 @@ available metrics for all queries. ```python -def detailed_comparative_summary(self, other: "RAGPipelineEvaluation"): +def detailed_comparative_summary(self, other: "EvaluationResults"): ``` ```bash From 67d2627e52a1f9efc8223143144b775ce1462352 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 16:23:25 +0200 Subject: [PATCH 11/15] PR comments --- proposals/text/7462-rag-evaluation.md | 43 ++++++++++----------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 7821d9a37a..5de75bcde3 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -19,16 +19,19 @@ RAG models are one of them most popular use cases for Haystack. We are adding su The output results of an evaluation pipeline composed of `evaluator` components are passed to a `EvaluationResults` (this is a placeholder name) which stores them internally and acts as an interface to access and present the results. +The examples below are just for illustrative purposes and are subject to change. + Example of the data structure that the `EvaluationResults` class will receive for initialization: ```python data = { - "queries": { + "inputs": { "query_id": ["53c3b3e6", "225f87f7"], "question": ["What is the capital of France?", "What is the capital of Spain?"], "contexts": ["wiki_France", "wiki_Spain"], - "answer": ["Paris", "Madrid"] + "answer": ["Paris", "Madrid"], + "predicted_answer": ["Paris", "Madrid"] }, "metrics": [ @@ -39,29 +42,23 @@ data = { {"name": "faithfulness", "scores": [0.135581, 0.695974, 0.749861, 0.041999]}, {"name": "semantic_answer_similarity", "scores": [0.971241, 0.159320, 0.019722, 1]} ], - "pipeline_answers": ["Paris", "Madrid"] }, ``` -- At least the `query_id` or the `question and context` should be present in the data structure. -- At least one of the metrics should be present in the data structure. -- The `pipeline_answers` field is optional, it is used to compare the answers generated by the pipeline with the expected answers. - - The `EvaluationResults` class provides the following methods to different types of users: Basic users: -- `evaluation_report()` -- `comparative_evaluation_summary()` +- `individual_aggregate_score_report()` +- `comparative_aggregate_score_report()` Intermediate users: -- `detailed_evaluation_report()` -- `comparative_detailed_evaluation_report()` +- `individual_detailed_score_report()` +- `comparative_detailed_score_report` Advanced users: - `find_thresholds()` -- `find_scores_below_threshold()` +- `find_inputs_below_threshold()` ### Methods description @@ -69,7 +66,7 @@ An evaluation report that provides a summary of the performance of the model acr aggregated scores for all available metrics. ```python -def evaluation_report(): +def individual_aggregate_score_report(): ``` Example output @@ -87,7 +84,7 @@ Example output A detailed evaluation report that provides the scores of all available metrics for all queries or a subset of queries. ```python -def get_detailed_scores(query_ids: Union[List[str], str] = "all"): +def individual_detailed_score_report(queries: Union[List[str], str] = "all"): ``` Example output @@ -119,7 +116,7 @@ A comparative summary that compares the performance of the model with another mo for all available metrics. 
```python -def comparative_summary(self, other: "EvaluationResults"): +def comparative_aggregate_score_report(self, other: "EvaluationResults"): ``` ```bash @@ -149,7 +146,7 @@ available metrics for all queries. ```python -def detailed_comparative_summary(self, other: "EvaluationResults"): +def comparative_detailed_score_report(self, other: "EvaluationResults"): ``` ```bash @@ -217,17 +214,13 @@ data = { Then have another method that ```python -def get_scores_below_threshold(self, metric: str, threshold: float): +def find_inputs_below_threshold(self, metric: str, threshold: float): """Get the all the queries with a score below a certain threshold for a given metric""" ``` - # Drawbacks -- Relying on pandas DataFrame internally makes it easy to perform many of the operations. -- Nevertheless, it can be burden, since we are making `pandas` a dependency of `haystack-ai`. -- Ideally all the proposed methods should be implemented in a way that doesn't require `pandas`. - +- # Adoption strategy @@ -237,7 +230,3 @@ def get_scores_below_threshold(self, metric: str, threshold: float): - A tutorial would be the best approach to teach users how to use this feature. - Adding a new entry to the documentation. - -# Unresolved questions - -- The `comparative_summary()` and the `comparative_detailed_summary()` methods need to be adopted to different definitions of what a correct answer is. From ecb0b53e5b05b844cc58916ed4dfa9e9ae14d418 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 16:36:52 +0200 Subject: [PATCH 12/15] changing output to table format --- proposals/text/7462-rag-evaluation.md | 77 ++++++--------------------- 1 file changed, 16 insertions(+), 61 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 5de75bcde3..121adef4db 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -90,24 +90,12 @@ def individual_detailed_score_report(queries: Union[List[str], str] = "all"): Example output ```bash -data = { - "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], - "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], - "single_hit": [1, 1, 0, 1], - "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], - "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], - "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], - "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], - "aggregated_score": - { - 'Reciprocal Rank': 0.448, - 'Single Hit': 0.5, - 'Multi Hit': 0.540, - 'Context Relevance': 0.537, - 'Faithfulness': 0.452, - 'Semantic Answer Similarity': 0.478 - } -} +| query_id | reciprocal_rank | single_hit | multi_hit | context_relevance | faithfulness | semantic_answer_similarity | +|----------|-----------------|------------|-----------|-------------------|-------------|----------------------------| +| 53c3b3e6 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | +| 225f87f7 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 | +| 8ac473ec | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 | +| 97d284ca | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 | ``` ### Comparative Evaluation Report @@ -150,50 +138,15 @@ def comparative_detailed_score_report(self, other: "EvaluationResults"): ``` ```bash -{ - "queries": { - "query_id": ["53c3b3e6, 225f87f7, 8ac473ec, 97d284ca"], - "question": ["What is the capital of France?", "What is the capital of Spain?"], - "contexts": ["wiki_France", "wiki_Spain"], - "answer": ["Paris", 
"Madrid"] - } - "pipeline_1": { - "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], - "single_hit": [1, 1, 0, 1], - "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], - "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], - "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], - "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], - "aggregated_score": - { - 'Reciprocal Rank': 0.448, - 'Single Hit': 0.5, - 'Multi Hit': 0.540, - 'Context Relevance': 0.537, - 'Faithfulness': 0.452, - 'Semantic Answer Similarity': 0.478 - } - }, - "pipeline_2": { - "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], - "single_hit": [1, 1, 0, 1], - "multi_hit": [0.706125, 0.454976, 0.445512, 0.250522], - "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], - "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], - "semantic_answer_similarity": [0.971241, 0.159320, 0.019722, 1], - "aggregated_score": - { - 'Reciprocal Rank': 0.448, - 'Single Hit': 0.5, - 'Multi Hit': 0.540, - 'Context Relevance': 0.537, - 'Faithfulness': 0.452, - 'Semantic Answer Similarity': 0.478 - } - } -} +| query_id | reciprocal_rank_model_1 | single_hit_model_1 | multi_hit_model_1 | context_relevance_model_1 | faithfulness_model_1 | semantic_answer_similarity_model_1 | reciprocal_rank_model_2 | single_hit_model_2 | multi_hit_model_2 | context_relevance_model_2 | faithfulness_model_2 | semantic_answer_similarity_model_2 | +|----------|-------------------------|--------------------|-------------------|---------------------------|----------------------|------------------------------------|-------------------------|--------------------|-------------------|---------------------------|----------------------|------------------------------------| +| 53c3b3e6 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | +| 225f87f7 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 | +| 8ac473ec | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 | +| 97d284ca | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 | ``` + Have a method to find interesting scores thresholds, typically used for error analysis, for all metrics available. Some potentially interesting thresholds to find are: the 25th percentile, the 75th percentile, the mean , the median. @@ -220,7 +173,9 @@ def find_inputs_below_threshold(self, metric: str, threshold: float): # Drawbacks -- +- Having the output in a format table may not be flexible enough, and maybe too verbose, for datasets with a large number of queries. +- Maybe a JSON format would be better with the option to export to a .csv file. + # Adoption strategy From f5a21b9b899823bdae33150bf453b28db7665cd7 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 8 Apr 2024 17:24:45 +0200 Subject: [PATCH 13/15] adding user stories --- proposals/text/7462-rag-evaluation.md | 46 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 121adef4db..b2a6d319e4 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -156,7 +156,7 @@ def find_thresholds(self, metrics: List[str]) -> Dict[str, float]: ```bash data = { - "thresholds": ["25th percentile", "75th percentile", "mean", "average"], + "thresholds": ["25th percentile", "75th percentile", "median", "average"], "reciprocal_rank": [0.378064, 0.534964, 0.216058, 0.778642], "context_relevance": [0.805466, 0.410251, 0.750070, 0.361332], "faithfulness": [0.135581, 0.695974, 0.749861, 0.041999], @@ -173,8 +173,9 @@ def find_inputs_below_threshold(self, metric: str, threshold: float): # Drawbacks -- Having the output in a format table may not be flexible enough, and maybe too verbose, for datasets with a large number of queries. -- Maybe a JSON format would be better with the option to export to a .csv file. +- Having the output in a format table may not be flexible enough, and maybe too verbose for datasets with a large number of queries. +- Maybe the option to export to a .csv file would be better than having the output in a table format. +- Maybe a JSON format would be better with the option for advanced users to do further analysis and visualization. # Adoption strategy @@ -185,3 +186,42 @@ def find_inputs_below_threshold(self, metric: str, threshold: float): - A tutorial would be the best approach to teach users how to use this feature. - Adding a new entry to the documentation. + +# User stories + +### 1. I would like to get a single summary score for my RAG pipeline so I can compare several pipeline configurations. + +Run `individual_aggregate_score_report()` and get the following output: + +```bash +{'Reciprocal Rank': 0.448, + 'Single Hit': 0.5, + 'Multi Hit': 0.540, + 'Context Relevance': 0.537, + 'Faithfulness': 0.452, + 'Semantic Answer Similarity': 0.478 + } + ``` + +### 2. I am not sure what evaluation metrics work best for my RAG pipeline, specially when using the more novel LLM-based + +Use `context relevance` or `faithfulness` + +### 3. My RAG pipeline has a low aggregate score, so I would like to see examples of specific inputs where the score was low to be able to diagnose what the issue could be. + +Let's say it's a low score in `reciprocal_rank` and one already has an idea of what "low" is a query/question, then simply run: + + find_inputs_below_threshold("reciprocal_rank", ) + +If the low score is in `reciprocal_rank` one can first get thresholds for this metric using: + + `find_thresholds(["reciprocal_rank"])` + +this will give: + +- 25th percentile: (Q1) the value below which 25% of the data falls. +- median percentile: (Q2) the value below which 50% of the data falls. +- 75th percentile: (Q3) the value below which 75% of the data falls. + +this can help to decide what is considered a low score, and then get, for instance, queries with a score below +the Q2 threshold using `find_inputs_below_threshold("context_relevance", threshold)` From 19697508623964cd91cd949a357510297217608c Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 8 Apr 2024 17:40:01 +0200 Subject: [PATCH 14/15] Update proposals/text/7462-rag-evaluation.md Co-authored-by: Madeesh Kannan --- proposals/text/7462-rag-evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index b2a6d319e4..289a1ce9af 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -54,7 +54,7 @@ Basic users: Intermediate users: - `individual_detailed_score_report()` -- `comparative_detailed_score_report` +- `comparative_detailed_score_report()` Advanced users: - `find_thresholds()` From b1b6162ba861f96bcc035afb0b87270c1168e6c9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 8 Apr 2024 17:50:43 +0200 Subject: [PATCH 15/15] adding user stories --- proposals/text/7462-rag-evaluation.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/proposals/text/7462-rag-evaluation.md b/proposals/text/7462-rag-evaluation.md index 289a1ce9af..21935ce88f 100644 --- a/proposals/text/7462-rag-evaluation.md +++ b/proposals/text/7462-rag-evaluation.md @@ -90,12 +90,10 @@ def individual_detailed_score_report(queries: Union[List[str], str] = "all"): Example output ```bash -| query_id | reciprocal_rank | single_hit | multi_hit | context_relevance | faithfulness | semantic_answer_similarity | -|----------|-----------------|------------|-----------|-------------------|-------------|----------------------------| -| 53c3b3e6 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | -| 225f87f7 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 | -| 8ac473ec | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 | -| 97d284ca | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 | +| question | context | answer | predicted_answer | reciprocal_rank | single_hit | multi_hit | context_relevance | faithfulness | semantic_answer_similarity | +|----------|---------|--------|------------------|-----------------|------------|-----------|-------------------|-------------|----------------------------| +| What is the capital of France? | wiki_France | Paris | Paris | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | +| What is the capital of Spain? 
| wiki_Spain | Madrid | Madrid | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 |
 ```
 
 ### Comparative Evaluation Report
@@ -138,13 +136,11 @@ def comparative_detailed_score_report(self, other: "EvaluationResults"):
 ```
 
 ```bash
-| query_id | reciprocal_rank_model_1 | single_hit_model_1 | multi_hit_model_1 | context_relevance_model_1 | faithfulness_model_1 | semantic_answer_similarity_model_1 | reciprocal_rank_model_2 | single_hit_model_2 | multi_hit_model_2 | context_relevance_model_2 | faithfulness_model_2 | semantic_answer_similarity_model_2 |
-|----------|-------------------------|--------------------|-------------------|---------------------------|----------------------|------------------------------------|-------------------------|--------------------|-------------------|---------------------------|----------------------|------------------------------------|
-| 53c3b3e6 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 | 0.378064 | 1 | 0.706125 | 0.805466 | 0.135581 | 0.971241 |
-| 225f87f7 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 | 0.534964 | 1 | 0.454976 | 0.410251 | 0.695974 | 0.159320 |
-| 8ac473ec | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 | 0.216058 | 0 | 0.445512 | 0.750070 | 0.749861 | 0.019722 |
-| 97d284ca | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 | 0.778642 | 1 | 0.250522 | 0.361332 | 0.041999 | 1 |
-```
+| question | context | answer | predicted_answer_model_1 | predicted_answer_model_2 | reciprocal_rank_model_1 | reciprocal_rank_model_2 | single_hit_model_1 | single_hit_model_2 | multi_hit_model_1 | multi_hit_model_2 | context_relevance_model_1 | context_relevance_model_2 | faithfulness_model_1 | faithfulness_model_2 | semantic_answer_similarity_model_1 | semantic_answer_similarity_model_2 |
+|----------|---------|--------|--------------------------|--------------------------|-------------------------|-------------------------|--------------------|--------------------|-------------------|-------------------|---------------------------|---------------------------|----------------------|----------------------|------------------------------------|------------------------------------|
+| What is the capital of France? | wiki_France | Paris | Paris | Paris | 0.378064 | 0.378064 | 1 | 1 | 0.706125 | 0.706125 | 0.805466 | 0.805466 | 0.135581 | 0.135581 | 0.971241 | 0.971241 |
+| What is the capital of Spain? | wiki_Spain | Madrid | Madrid | Madrid | 0.534964 | 0.534964 | 1 | 1 | 0.454976 | 0.454976 | 0.410251 | 0.410251 | 0.695974 | 0.695974 | 0.159320 | 0.159320 |
+```
 
 Have a method to find interesting scores thresholds, typically used for error analysis, for all metrics available.