From 81f1bbaf989d41b42af0fba16364f040d9cfb399 Mon Sep 17 00:00:00 2001 From: shadeMe Date: Fri, 5 Jul 2024 12:26:07 +0200 Subject: [PATCH] feat: Add notebook for RAG eval harness --- examples/rag_eval_harness.ipynb | 1678 +++++++++++++++++++++++++++++++ 1 file changed, 1678 insertions(+) create mode 100644 examples/rag_eval_harness.ipynb diff --git a/examples/rag_eval_harness.ipynb b/examples/rag_eval_harness.ipynb new file mode 100644 index 00000000..530f9537 --- /dev/null +++ b/examples/rag_eval_harness.ipynb @@ -0,0 +1,1678 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "# We assume that the haystack-experimental package is already installed.\n", + "pip install datasets\n", + "pip install sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's set the OpenAI API key environment variable to ensure that\n", + "# LLM-based evaluators can query the OpenAI API.\n", + "import os\n", + "from getpass import getpass\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + " warn(\"The installed version of bitsandbytes was compiled without GPU support. 
\"\n" + ] + } + ], + "source": [ + "# All the imports that we'll need to create the following:\n", + "# - An indexing pipeline that stores documents from our chosen dataset in a document store.\n", + "# - A retrieval pipeline that uses a query to retrieve relevant documents from the document store.\n", + "import json\n", + "from typing import List, Dict\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "import random\n", + "from datasets import load_dataset, Dataset\n", + "from tqdm import tqdm\n", + "import shutil\n", + "\n", + "from haystack import Document, Pipeline\n", + "from haystack.components.builders import AnswerBuilder, PromptBuilder\n", + "from haystack.components.embedders import (\n", + " SentenceTransformersDocumentEmbedder,\n", + " SentenceTransformersTextEmbedder,\n", + ")\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.retrievers import (\n", + " InMemoryEmbeddingRetriever,\n", + " InMemoryBM25Retriever,\n", + ")\n", + "from haystack.components.writers import DocumentWriter\n", + "\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy, DocumentStore\n", + "from haystack_experimental.evaluation.harness.rag import (\n", + " DefaultRAGArchitecture,\n", + " RAGEvaluationHarness,\n", + " RAGEvaluationMetric,\n", + " RAGEvaluationInput,\n", + " RAGEvaluationOutput,\n", + " RAGEvaluationOverrides,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preparation\n", + "\n", + "The following steps will load the SQUAD dataset, preprocess them for the indexing pipeline and store them to a local folder in the current working directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper functions to load the SQUAD dataset.\n", + "def aggregate_wiki_title(data: Dataset, agg_wiki_title: Dict[str, Dict[str, List[str]]]):\n", + " for idx, x in enumerate(data.iter(batch_size=1)):\n", + " if x[\"context\"] not in agg_wiki_title[x[\"title\"][0]][\"context\"]:\n", + " agg_wiki_title[x[\"title\"][0]][\"context\"].append(x[\"context\"])\n", + " agg_wiki_title[x[\"title\"][0]][\"question_answers\"].append(\n", + " {\"question\": x[\"question\"], \"answers\": x[\"answers\"]}\n", + " )\n", + "\n", + "def load_transformed_squad():\n", + " with open(\"transformed_squad/questions.jsonl\", \"r\") as f:\n", + " questions = [json.loads(x) for x in f.readlines()]\n", + " for idx, question in enumerate(questions):\n", + " question[\"query_id\"] = f\"query_{idx}\"\n", + "\n", + " def create_document(text: str, name: str):\n", + " return Document(content=text, meta={\"name\": name})\n", + "\n", + " # walk through the files in the directory and transform each text file into a Document\n", + " documents = []\n", + " for root, dirs, files in os.walk(\"transformed_squad/articles/\"):\n", + " for article in files:\n", + " with open(f\"{root}/{article}\", \"r\") as f:\n", + " raw_texts = f.read().split(\"\\n\")\n", + " for text in raw_texts:\n", + " documents.append(\n", + " create_document(text, article.replace(\".txt\", \"\"))\n", + " )\n", + "\n", + " return questions, documents" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 490/490 [00:00<00:00, 57035.27it/s]\n", + "100%|██████████| 490/490 [00:00<00:00, 9517.10it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".\n" + ] + } + ], + "source": [ + "data_train = load_dataset(\"squad\", split=\"train\")\n", + "data_validation = load_dataset(\"squad\", split=\"validation\")\n", + "agg_wiki_title = defaultdict(\n", + " lambda: {\"context\": [], \"question_answers\": [], \"text\": \"\"}\n", + ")\n", + "aggregate_wiki_title(data_train, agg_wiki_title)\n", + "aggregate_wiki_title(data_validation, agg_wiki_title)\n", + "\n", + "# merge the context into a single document\n", + "for article in tqdm(agg_wiki_title.keys()):\n", + " agg_wiki_title[article][\"text\"] = \"\\n\".join(\n", + " [x[0] for x in agg_wiki_title[article][\"context\"]]\n", + " )\n", + "\n", + "# create documents\n", + "for article in tqdm(agg_wiki_title.keys()):\n", + " out_path = Path(\"transformed_squad/articles/\")\n", + " out_path.mkdir(parents=True, exist_ok=True)\n", + " with open(f\"{str(out_path)}/{article}.txt\", \"w\") as f:\n", + " f.write(agg_wiki_title[article][\"text\"])\n", + "\n", + "# create question/answers\n", + "questions = Path(\"transformed_squad/\")\n", + "questions.mkdir(parents=True, exist_ok=True)\n", + "with open(f\"{str(questions)}/questions.jsonl\", \"w\") as f:\n", + " for article in agg_wiki_title.keys():\n", + " for entry in agg_wiki_title[article][\"question_answers\"]:\n", + " f.write(\n", + " json.dumps(\n", + " {\n", + " \"question\": entry[\"question\"][0],\n", + " \"document\": article,\n", + " \"answers\": entry[\"answers\"][0],\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + "\n", + "questions, documents = load_transformed_squad()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing pipeline" + ] + }, + { + "cell_type": "code", + 
"execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create a pipeline that indexes the documents in the document store.\n", + "def indexing(documents: List[Document]) -> InMemoryDocumentStore:\n", + " document_store = InMemoryDocumentStore()\n", + "\n", + " doc_writer = DocumentWriter(\n", + " document_store=document_store, policy=DuplicatePolicy.SKIP\n", + " )\n", + " doc_embedder = SentenceTransformersDocumentEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " ingestion_pipe = Pipeline()\n", + " ingestion_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n", + " ingestion_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n", + "\n", + " ingestion_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n", + " ingestion_pipe.run({\"doc_embedder\": {\"documents\": documents}})\n", + "\n", + " return document_store" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1dd82506edbf4c7dbf0f4f1cda0d7902", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/32 [00:00 Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ document.content }}\n", + " {% endfor %}\n", + "\n", + " Question: {{question}}\n", + " Answer:\n", + " \"\"\"\n", + "\n", + " pipeline = Pipeline()\n", + " pipeline.add_component(\n", + " \"query_embedder\",\n", + " SentenceTransformersTextEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " progress_bar=False,\n", + " ),\n", + " )\n", + " pipeline.add_component(\n", + " \"retriever\", InMemoryEmbeddingRetriever(document_store, top_k=top_k)\n", + " )\n", + " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + " pipeline.add_component(\n", + " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n", + " )\n", + " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n", + "\n", + " pipeline.connect(\"query_embedder\", \"retriever.query_embedding\")\n", + " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", + " pipeline.connect(\"prompt_builder\", \"generator\")\n", + " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", + " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n", + " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n", + "\n", + " return pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create an keyword-based RAG pipeline.\n", + "def build_keyword_rag_pipeline(document_store: InMemoryDocumentStore, top_k: int = 2) -> Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ 
document.content }}\n",
+ " {% endfor %}\n",
+ "\n",
+ " Question: {{question}}\n",
+ " Answer:\n",
+ " \"\"\"\n",
+ "\n",
+ " pipeline = Pipeline()\n",
+ " pipeline.add_component(\n",
+ " \"retriever\", InMemoryBM25Retriever(document_store, top_k=top_k)\n",
+ " )\n",
+ " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
+ " pipeline.add_component(\n",
+ " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n",
+ " )\n",
+ " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n",
+ "\n",
+ " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
+ " pipeline.connect(\"prompt_builder\", \"generator\")\n",
+ " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n",
+ " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n",
+ " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n",
+ "\n",
+ " return pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "emb_rag_pipeline = build_emb_rag_pipeline(document_store, top_k=2)\n",
+ "keyword_rag_pipeline = build_keyword_rag_pipeline(document_store, top_k=2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation harness\n",
+ "\n",
+ "The RAG evaluation harness comes with a predefined set of evaluation metrics, which are enumerated in the `RAGEvaluationMetric` enum.\n",
+ "\n",
+ "It also ships with a set of `DefaultRAGArchitecture` presets that describe commonly used RAG pipeline layouts, such as generation with embedding-based retrieval and generation with keyword-based retrieval. Passing one of these presets as `rag_components` tells the harness which components of the pipeline act as the query processor, the document retriever and the response generator, so that it can wire them up to its internal evaluation pipeline. Pipelines with arbitrary architectures can also be evaluated by providing this metadata manually, as shown towards the end of this notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create harnesses to evaluate the embedding-based and keyword-based RAG pipelines.\n",
+ "emb_eval_harness = RAGEvaluationHarness(emb_rag_pipeline,\n",
+ " rag_components=DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL,\n",
+ " metrics={\n",
+ " RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ " RAGEvaluationMetric.FAITHFULNESS\n",
+ " })\n",
+ "keyword_eval_harness = RAGEvaluationHarness(keyword_rag_pipeline,\n",
+ " rag_components=DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL,\n",
+ " metrics={\n",
+ " RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ " RAGEvaluationMetric.FAITHFULNESS\n",
+ " })"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the inputs to the evaluation harness.\n",
+ "# These inputs will be automatically passed to the RAG pipeline\n",
+ "# and the evaluation pipeline that the harness internally uses.\n",
+ "\n",
+ "input_questions = random.sample(questions, 10)\n",
+ "\n",
+ "eval_harness_input = RAGEvaluationInput(\n",
+ " queries=[q[\"question\"] for q in input_questions],\n",
+ " ground_truth_answers=[q[\"answers\"][\"text\"][0] for q in input_questions],\n",
+ " ground_truth_documents=[\n",
+ " [\n",
+ " doc\n",
+ " for doc in document_store.storage.values()\n",
+ " if doc.meta[\"name\"] == q[\"document\"]\n",
+ " ]\n",
+ " for q in input_questions\n",
+ " ],\n",
+ " rag_pipeline_inputs={\n",
+ " \"prompt_builder\": {\"question\": [q[\"question\"] for q in input_questions]},\n",
+ " \"answer_builder\": {\"query\": [q[\"question\"] for q in input_questions]},\n",
+ " },\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b6d13904c0a84eb7b8598bd7a78c75b1",
+ "version_major": 2,
+ "version_minor": 
0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_doc_map0.25
1metric_doc_recall_single0.40
2metric_faithfulness0.65
\n", + "" + ], + "text/plain": [ + " metrics score\n", + "0 metric_doc_map 0.25\n", + "1 metric_doc_recall_single 0.40\n", + "2 metric_faithfulness 0.65" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score report:\")\n", + "emb_eval_run.results.score_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation score dataframe:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesground_truth_answersground_truth_documentsmetric_doc_mapmetric_doc_recall_singlemetric_faithfulness
0In what year did Cortes send the first cochine...[The Spanish ship San Pedro and two other vess...The information provided does not mention Cort...1523[In the United States, political commentators ...0.00.00.0
1What issues may prevent women from working out...[Children working at a young age has been a co...Some issues that may prevent women from workin...gender roles and customs[A number of researchers (David Rodda, Jacob V...0.51.01.0
2What was a neologism expressing the introducti...[After 1517, when the new invention of printin...The neologism expressing the introduction of n...Darwinism[After World War I, when Britain and France di...0.00.00.0
3When were images being used to promote the spr...[The availability of the Bible in vernacular l...Images were used to promote the spread of Luth...1530s and 1540s[Luther wrote \"Ach Gott, vom Himmel sieh darei...0.00.01.0
4In the layered model of the Earth, the mantle ...[The mantle is equivalent to 10 to 15 Earth ma...The two layers below the mantle in the Earth's...the outer core and inner core[The principle of inclusions and components st...0.51.01.0
5What ancient religious scriptures were among t...[Buddhist scriptures and other texts exist in ...The Vedas were among the first examples of Ind...The Vedas[Poetry is a form of literary art which uses a...0.00.01.0
6How many times was Tom shot?[Hidalgo was turned over to the Bishop of Dura...There is no information provided about a perso...seventeen[Sergel's play toured in the UK starting at We...0.00.00.0
7What was persistent unemployment have a negati...[As of September 2014, the greater Atlantic Ci...Persistent unemployment can have a negative ef...subsequent long-run economic growth[A number of researchers (David Rodda, Jacob V...0.51.00.5
8What lesson did Johann von Staupitz teach Luth...[His poor physical health made him short-tempe...Johann von Staupitz taught Luther that repenta...a change of heart[Luther wrote \"Ach Gott, vom Himmel sieh darei...1.01.01.0
9What is the German word for living space?[The term was created in 1920 by Hans Winkler,...The German word for living space is Lebensraum.Lebensraum[During World War II, Hitler's Generalplan Ost...0.00.01.0
\n", + "
" + ], + "text/plain": [ + " questions \\\n", + "0 In what year did Cortes send the first cochine... \n", + "1 What issues may prevent women from working out... \n", + "2 What was a neologism expressing the introducti... \n", + "3 When were images being used to promote the spr... \n", + "4 In the layered model of the Earth, the mantle ... \n", + "5 What ancient religious scriptures were among t... \n", + "6 How many times was Tom shot? \n", + "7 What was persistent unemployment have a negati... \n", + "8 What lesson did Johann von Staupitz teach Luth... \n", + "9 What is the German word for living space? \n", + "\n", + " contexts \\\n", + "0 [The Spanish ship San Pedro and two other vess... \n", + "1 [Children working at a young age has been a co... \n", + "2 [After 1517, when the new invention of printin... \n", + "3 [The availability of the Bible in vernacular l... \n", + "4 [The mantle is equivalent to 10 to 15 Earth ma... \n", + "5 [Buddhist scriptures and other texts exist in ... \n", + "6 [Hidalgo was turned over to the Bishop of Dura... \n", + "7 [As of September 2014, the greater Atlantic Ci... \n", + "8 [His poor physical health made him short-tempe... \n", + "9 [The term was created in 1920 by Hans Winkler,... \n", + "\n", + " responses \\\n", + "0 The information provided does not mention Cort... \n", + "1 Some issues that may prevent women from workin... \n", + "2 The neologism expressing the introduction of n... \n", + "3 Images were used to promote the spread of Luth... \n", + "4 The two layers below the mantle in the Earth's... \n", + "5 The Vedas were among the first examples of Ind... \n", + "6 There is no information provided about a perso... \n", + "7 Persistent unemployment can have a negative ef... \n", + "8 Johann von Staupitz taught Luther that repenta... \n", + "9 The German word for living space is Lebensraum. \n", + "\n", + " ground_truth_answers \\\n", + "0 1523 \n", + "1 gender roles and customs \n", + "2 Darwinism \n", + "3 1530s and 1540s \n", + "4 the outer core and inner core \n", + "5 The Vedas \n", + "6 seventeen \n", + "7 subsequent long-run economic growth \n", + "8 a change of heart \n", + "9 Lebensraum \n", + "\n", + " ground_truth_documents metric_doc_map \\\n", + "0 [In the United States, political commentators ... 0.0 \n", + "1 [A number of researchers (David Rodda, Jacob V... 0.5 \n", + "2 [After World War I, when Britain and France di... 0.0 \n", + "3 [Luther wrote \"Ach Gott, vom Himmel sieh darei... 0.0 \n", + "4 [The principle of inclusions and components st... 0.5 \n", + "5 [Poetry is a form of literary art which uses a... 0.0 \n", + "6 [Sergel's play toured in the UK starting at We... 0.0 \n", + "7 [A number of researchers (David Rodda, Jacob V... 0.5 \n", + "8 [Luther wrote \"Ach Gott, vom Himmel sieh darei... 1.0 \n", + "9 [During World War II, Hitler's Generalplan Ost... 
0.0 \n", + "\n", + " metric_doc_recall_single metric_faithfulness \n", + "0 0.0 0.0 \n", + "1 1.0 1.0 \n", + "2 0.0 0.0 \n", + "3 0.0 1.0 \n", + "4 1.0 1.0 \n", + "5 0.0 1.0 \n", + "6 0.0 0.0 \n", + "7 1.0 0.5 \n", + "8 1.0 1.0 \n", + "9 0.0 1.0 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score dataframe:\")\n", + "emb_eval_run.results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c2aa150209c147a095866717d8f094f4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesground_truth_answersground_truth_documentsemb_eval_run_metric_doc_mapemb_eval_run_metric_doc_recall_singleemb_eval_run_metric_faithfulnessemb_eval_run_gpt4_metric_doc_mapemb_eval_run_gpt4_metric_doc_recall_singleemb_eval_run_gpt4_metric_faithfulness
0In what year did Cortes send the first cochine...[The Spanish ship San Pedro and two other vess...The information provided does not mention Cort...1523[In the United States, political commentators ...0.00.00.00.00.01.0
1What issues may prevent women from working out...[Children working at a young age has been a co...Some issues that may prevent women from workin...gender roles and customs[A number of researchers (David Rodda, Jacob V...0.51.01.00.51.01.0
2What was a neologism expressing the introducti...[After 1517, when the new invention of printin...The neologism expressing the introduction of n...Darwinism[After World War I, when Britain and France di...0.00.00.00.00.01.0
3When were images being used to promote the spr...[The availability of the Bible in vernacular l...Images were used to promote the spread of Luth...1530s and 1540s[Luther wrote \"Ach Gott, vom Himmel sieh darei...0.00.01.00.00.01.0
4In the layered model of the Earth, the mantle ...[The mantle is equivalent to 10 to 15 Earth ma...The two layers below the mantle in the Earth's...the outer core and inner core[The principle of inclusions and components st...0.51.01.00.51.01.0
5What ancient religious scriptures were among t...[Buddhist scriptures and other texts exist in ...The Vedas were among the first examples of Ind...The Vedas[Poetry is a form of literary art which uses a...0.00.01.00.00.01.0
6How many times was Tom shot?[Hidalgo was turned over to the Bishop of Dura...There is no information provided about a perso...seventeen[Sergel's play toured in the UK starting at We...0.00.00.00.00.01.0
7What was persistent unemployment have a negati...[As of September 2014, the greater Atlantic Ci...Persistent unemployment can have a negative ef...subsequent long-run economic growth[A number of researchers (David Rodda, Jacob V...0.51.00.50.51.01.0
8What lesson did Johann von Staupitz teach Luth...[His poor physical health made him short-tempe...Johann von Staupitz taught Luther that repenta...a change of heart[Luther wrote \"Ach Gott, vom Himmel sieh darei...1.01.01.01.01.01.0
9What is the German word for living space?[The term was created in 1920 by Hans Winkler,...The German word for living space is Lebensraum.Lebensraum[During World War II, Hitler's Generalplan Ost...0.00.01.00.00.00.0
\n", + "" + ], + "text/plain": [ + " questions \\\n", + "0 In what year did Cortes send the first cochine... \n", + "1 What issues may prevent women from working out... \n", + "2 What was a neologism expressing the introducti... \n", + "3 When were images being used to promote the spr... \n", + "4 In the layered model of the Earth, the mantle ... \n", + "5 What ancient religious scriptures were among t... \n", + "6 How many times was Tom shot? \n", + "7 What was persistent unemployment have a negati... \n", + "8 What lesson did Johann von Staupitz teach Luth... \n", + "9 What is the German word for living space? \n", + "\n", + " contexts \\\n", + "0 [The Spanish ship San Pedro and two other vess... \n", + "1 [Children working at a young age has been a co... \n", + "2 [After 1517, when the new invention of printin... \n", + "3 [The availability of the Bible in vernacular l... \n", + "4 [The mantle is equivalent to 10 to 15 Earth ma... \n", + "5 [Buddhist scriptures and other texts exist in ... \n", + "6 [Hidalgo was turned over to the Bishop of Dura... \n", + "7 [As of September 2014, the greater Atlantic Ci... \n", + "8 [His poor physical health made him short-tempe... \n", + "9 [The term was created in 1920 by Hans Winkler,... \n", + "\n", + " responses \\\n", + "0 The information provided does not mention Cort... \n", + "1 Some issues that may prevent women from workin... \n", + "2 The neologism expressing the introduction of n... \n", + "3 Images were used to promote the spread of Luth... \n", + "4 The two layers below the mantle in the Earth's... \n", + "5 The Vedas were among the first examples of Ind... \n", + "6 There is no information provided about a perso... \n", + "7 Persistent unemployment can have a negative ef... \n", + "8 Johann von Staupitz taught Luther that repenta... \n", + "9 The German word for living space is Lebensraum. \n", + "\n", + " ground_truth_answers \\\n", + "0 1523 \n", + "1 gender roles and customs \n", + "2 Darwinism \n", + "3 1530s and 1540s \n", + "4 the outer core and inner core \n", + "5 The Vedas \n", + "6 seventeen \n", + "7 subsequent long-run economic growth \n", + "8 a change of heart \n", + "9 Lebensraum \n", + "\n", + " ground_truth_documents \\\n", + "0 [In the United States, political commentators ... \n", + "1 [A number of researchers (David Rodda, Jacob V... \n", + "2 [After World War I, when Britain and France di... \n", + "3 [Luther wrote \"Ach Gott, vom Himmel sieh darei... \n", + "4 [The principle of inclusions and components st... \n", + "5 [Poetry is a form of literary art which uses a... \n", + "6 [Sergel's play toured in the UK starting at We... \n", + "7 [A number of researchers (David Rodda, Jacob V... \n", + "8 [Luther wrote \"Ach Gott, vom Himmel sieh darei... \n", + "9 [During World War II, Hitler's Generalplan Ost... 
\n", + "\n", + " emb_eval_run_metric_doc_map emb_eval_run_metric_doc_recall_single \\\n", + "0 0.0 0.0 \n", + "1 0.5 1.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.5 1.0 \n", + "5 0.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.5 1.0 \n", + "8 1.0 1.0 \n", + "9 0.0 0.0 \n", + "\n", + " emb_eval_run_metric_faithfulness emb_eval_run_gpt4_metric_doc_map \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.5 \n", + "2 0.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.5 \n", + "5 1.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.5 0.5 \n", + "8 1.0 1.0 \n", + "9 1.0 0.0 \n", + "\n", + " emb_eval_run_gpt4_metric_doc_recall_single \\\n", + "0 0.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 1.0 \n", + "5 0.0 \n", + "6 0.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 0.0 \n", + "\n", + " emb_eval_run_gpt4_metric_faithfulness \n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "5 1.0 \n", + "6 1.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 0.0 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compare the results of the two evaluation runs.\n", + "print(\"Comparison of the two evaluation runs:\")\n", + "emb_eval_run.results.comparative_individual_scores_report(emb_eval_run_gpt4.results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above code, we've primarily focused on using the `default_xxx` methods of the `RAGEvaluationHarness` class. They provide a straightforward way of getting started with the evaluation of simple RAG pipelines which use prototypical components. The harness can also be used to evaluate arbitrarily complex RAG pipelines. This is done by providing the harness with some extra metadata about the pipeline to be evaluated.\n", + "\n", + "To use an arbitrary pipeline with the harness, the latter requires information about the following components (c.f `RAGExpectedComponent`):\n", + "- Query processor - Component that processes the input query. \n", + " - Expects one input that contains the query string.\n", + "- Document retriever - Component that retrieves documents based on the input query.\n", + " - Expects one output that contains the retrieved documents.\n", + "- Response generator - Component that generates responses based on the query and the retrieved documents.\n", + " - Expects one output that contains the LLM's response(s).\n", + "\n", + "For each of the above, the user needs to provide the following metadata (c.f `RAGExpectedComponentMetadata`):\n", + "- The name of the component as seen in the pipeline.\n", + "- A mapping of the component's expected inputs to their corresponding input names.\n", + "- A mapping of the component's expected outputs to their corresponding output names.\n", + "\n", + "For example, let's consider `RAGExpectedComponent.QUERY_PROCESSOR`: Assume we have a RAG pipeline with an [`OpenAITextEmbedder`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L18) component called `\"txt_embedder\"`. Since the harness is responsible for passing the pipeline's input (the query) to the `OpenAITextEmbedder`, it needs to know the name of the component. Furthermore, it also needs to know the [name of `OpenAITextEmbedder`'s input](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L135) through which the query should be supplied. 
The metadata for the above would look like this:\n",
+ "```python\n",
+ "query_processor_metadata = RAGExpectedComponentMetadata(\n",
+ " name=\"txt_embedder\",\n",
+ " input_mapping={\n",
+ " \"query\": \"text\"\n",
+ " }\n",
+ ")\n",
+ "```\n",
+ "Similarly, for `RAGExpectedComponent.DOCUMENT_RETRIEVER`: Assume the RAG pipeline has an [`InMemoryEmbeddingRetriever`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/retrievers/in_memory/embedding_retriever.py#L12) component named `\"mem_retriever\"` that is connected to `\"txt_embedder\"`.\n",
+ "```python\n",
+ "document_retriever_metadata = RAGExpectedComponentMetadata(\n",
+ " name=\"mem_retriever\",\n",
+ " output_mapping={\n",
+ " \"retrieved_documents\": \"documents\"\n",
+ " }\n",
+ ")\n",
+ "```\n",
+ "Both `\"query\"` and `\"retrieved_documents\"` are \"meta\" identifiers used by the harness to specify expected inputs and outputs; they are specific to each `RAGExpectedComponent` enum variant and are documented in their docstrings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a harness to evaluate a custom RAG pipeline.\n",
+ "# Commented out because the pipeline is not defined in this notebook.\n",
+ "\n",
+ "# custom_eval_harness = RAGEvaluationHarness(\n",
+ "# rag_pipeline=custom_rag_pipeline,\n",
+ "# rag_components={\n",
+ "# RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(\n",
+ "# \"query_embedder\", input_mapping={\"query\": \"text\"}\n",
+ "# ),\n",
+ "# RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata(\n",
+ "# \"retriever\",\n",
+ "# output_mapping={\"retrieved_documents\": \"documents\"},\n",
+ "# ),\n",
+ "# RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata(\n",
+ "# \"generator\", output_mapping={\"replies\": \"replies\"}\n",
+ "# ),\n",
+ "# },\n",
+ "# metrics={\n",
+ "# RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ "# RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ "# RAGEvaluationMetric.FAITHFULNESS\n",
+ "# })"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There is no strict requirement when it comes to which components can act as a query processor, a document retriever or a response generator. For instance, it's perfectly fine if the query processor and the document retriever are the same component. In fact, this is the case when using a keyword-based retriever, which directly accepts the query (as opposed to having a query embedder in front of it)."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "dev",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}