From 81f1bbaf989d41b42af0fba16364f040d9cfb399 Mon Sep 17 00:00:00 2001 From: shadeMe Date: Fri, 5 Jul 2024 12:26:07 +0200 Subject: [PATCH] feat: Add notebook for RAG eval harness --- examples/rag_eval_harness.ipynb | 1678 +++++++++++++++++++++++++++++++ 1 file changed, 1678 insertions(+) create mode 100644 examples/rag_eval_harness.ipynb diff --git a/examples/rag_eval_harness.ipynb b/examples/rag_eval_harness.ipynb new file mode 100644 index 00000000..530f9537 --- /dev/null +++ b/examples/rag_eval_harness.ipynb @@ -0,0 +1,1678 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "# We assume that the haystack-experimental package is already installed.\n", + "pip install datasets\n", + "pip install sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's set the OpenAI API key environment variable to ensure that\n", + "# LLM-based evaluators can query the OpenAI API.\n", + "import os\n", + "from getpass import getpass\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + " warn(\"The installed version of bitsandbytes was compiled without GPU support. 
\"\n" + ] + } + ], + "source": [ + "# All the imports that we'll need to create the following:\n", + "# - An indexing pipeline that stores documents from our chosen dataset in a document store.\n", + "# - A retrieval pipeline that uses a query to retrieve relevant documents from the document store.\n", + "import json\n", + "from typing import List, Dict\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "import random\n", + "from datasets import load_dataset, Dataset\n", + "from tqdm import tqdm\n", + "import shutil\n", + "\n", + "from haystack import Document, Pipeline\n", + "from haystack.components.builders import AnswerBuilder, PromptBuilder\n", + "from haystack.components.embedders import (\n", + " SentenceTransformersDocumentEmbedder,\n", + " SentenceTransformersTextEmbedder,\n", + ")\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.retrievers import (\n", + " InMemoryEmbeddingRetriever,\n", + " InMemoryBM25Retriever,\n", + ")\n", + "from haystack.components.writers import DocumentWriter\n", + "\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy, DocumentStore\n", + "from haystack_experimental.evaluation.harness.rag import (\n", + " DefaultRAGArchitecture,\n", + " RAGEvaluationHarness,\n", + " RAGEvaluationMetric,\n", + " RAGEvaluationInput,\n", + " RAGEvaluationOutput,\n", + " RAGEvaluationOverrides,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preparation\n", + "\n", + "The following steps will load the SQUAD dataset, preprocess them for the indexing pipeline and store them to a local folder in the current working directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper functions to load the SQUAD dataset.\n", + "def aggregate_wiki_title(data: Dataset, agg_wiki_title: Dict[str, Dict[str, List[str]]]):\n", + " for idx, x in enumerate(data.iter(batch_size=1)):\n", + " if x[\"context\"] not in agg_wiki_title[x[\"title\"][0]][\"context\"]:\n", + " agg_wiki_title[x[\"title\"][0]][\"context\"].append(x[\"context\"])\n", + " agg_wiki_title[x[\"title\"][0]][\"question_answers\"].append(\n", + " {\"question\": x[\"question\"], \"answers\": x[\"answers\"]}\n", + " )\n", + "\n", + "def load_transformed_squad():\n", + " with open(\"transformed_squad/questions.jsonl\", \"r\") as f:\n", + " questions = [json.loads(x) for x in f.readlines()]\n", + " for idx, question in enumerate(questions):\n", + " question[\"query_id\"] = f\"query_{idx}\"\n", + "\n", + " def create_document(text: str, name: str):\n", + " return Document(content=text, meta={\"name\": name})\n", + "\n", + " # walk through the files in the directory and transform each text file into a Document\n", + " documents = []\n", + " for root, dirs, files in os.walk(\"transformed_squad/articles/\"):\n", + " for article in files:\n", + " with open(f\"{root}/{article}\", \"r\") as f:\n", + " raw_texts = f.read().split(\"\\n\")\n", + " for text in raw_texts:\n", + " documents.append(\n", + " create_document(text, article.replace(\".txt\", \"\"))\n", + " )\n", + "\n", + " return questions, documents" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 490/490 [00:00<00:00, 57035.27it/s]\n", + "100%|██████████| 490/490 [00:00<00:00, 9517.10it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".\n" + ] + } + ], + "source": [ + "data_train = load_dataset(\"squad\", split=\"train\")\n", + "data_validation = load_dataset(\"squad\", split=\"validation\")\n", + "agg_wiki_title = defaultdict(\n", + " lambda: {\"context\": [], \"question_answers\": [], \"text\": \"\"}\n", + ")\n", + "aggregate_wiki_title(data_train, agg_wiki_title)\n", + "aggregate_wiki_title(data_validation, agg_wiki_title)\n", + "\n", + "# merge the context into a single document\n", + "for article in tqdm(agg_wiki_title.keys()):\n", + " agg_wiki_title[article][\"text\"] = \"\\n\".join(\n", + " [x[0] for x in agg_wiki_title[article][\"context\"]]\n", + " )\n", + "\n", + "# create documents\n", + "for article in tqdm(agg_wiki_title.keys()):\n", + " out_path = Path(\"transformed_squad/articles/\")\n", + " out_path.mkdir(parents=True, exist_ok=True)\n", + " with open(f\"{str(out_path)}/{article}.txt\", \"w\") as f:\n", + " f.write(agg_wiki_title[article][\"text\"])\n", + "\n", + "# create question/answers\n", + "questions = Path(\"transformed_squad/\")\n", + "questions.mkdir(parents=True, exist_ok=True)\n", + "with open(f\"{str(questions)}/questions.jsonl\", \"w\") as f:\n", + " for article in agg_wiki_title.keys():\n", + " for entry in agg_wiki_title[article][\"question_answers\"]:\n", + " f.write(\n", + " json.dumps(\n", + " {\n", + " \"question\": entry[\"question\"][0],\n", + " \"document\": article,\n", + " \"answers\": entry[\"answers\"][0],\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + "\n", + "questions, documents = load_transformed_squad()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing pipeline" + ] + }, + { + "cell_type": "code", + 
"execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create a pipeline that indexes the documents in the document store.\n", + "def indexing(documents: List[Document]) -> InMemoryDocumentStore:\n", + " document_store = InMemoryDocumentStore()\n", + "\n", + " doc_writer = DocumentWriter(\n", + " document_store=document_store, policy=DuplicatePolicy.SKIP\n", + " )\n", + " doc_embedder = SentenceTransformersDocumentEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " ingestion_pipe = Pipeline()\n", + " ingestion_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n", + " ingestion_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n", + "\n", + " ingestion_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n", + " ingestion_pipe.run({\"doc_embedder\": {\"documents\": documents}})\n", + "\n", + " return document_store" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1dd82506edbf4c7dbf0f4f1cda0d7902", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/32 [00:00 Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ document.content }}\n", + " {% endfor %}\n", + "\n", + " Question: {{question}}\n", + " Answer:\n", + " \"\"\"\n", + "\n", + " pipeline = Pipeline()\n", + " pipeline.add_component(\n", + " \"query_embedder\",\n", + " SentenceTransformersTextEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " progress_bar=False,\n", + " ),\n", + " )\n", + " pipeline.add_component(\n", + " \"retriever\", InMemoryEmbeddingRetriever(document_store, top_k=top_k)\n", + " )\n", + " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + " pipeline.add_component(\n", + " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n", + " )\n", + " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n", + "\n", + " pipeline.connect(\"query_embedder\", \"retriever.query_embedding\")\n", + " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", + " pipeline.connect(\"prompt_builder\", \"generator\")\n", + " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", + " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n", + " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n", + "\n", + " return pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create an keyword-based RAG pipeline.\n", + "def build_keyword_rag_pipeline(document_store: InMemoryDocumentStore, top_k: int = 2) -> Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ 
document.content }}\n",
+ " {% endfor %}\n",
+ "\n",
+ " Question: {{question}}\n",
+ " Answer:\n",
+ " \"\"\"\n",
+ "\n",
+ " pipeline = Pipeline()\n",
+ " pipeline.add_component(\n",
+ " \"retriever\", InMemoryBM25Retriever(document_store, top_k=top_k)\n",
+ " )\n",
+ " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n",
+ " pipeline.add_component(\n",
+ " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n",
+ " )\n",
+ " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n",
+ "\n",
+ " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
+ " pipeline.connect(\"prompt_builder\", \"generator\")\n",
+ " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n",
+ " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n",
+ " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n",
+ "\n",
+ " return pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "emb_rag_pipeline = build_emb_rag_pipeline(document_store, top_k=2)\n",
+ "keyword_rag_pipeline = build_keyword_rag_pipeline(document_store, top_k=2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation harness\n",
+ "\n",
+ "The RAG evaluation harness comes with a predefined set of evaluation metrics, which are enumerated in the `RAGEvaluationMetric` enum.\n",
+ "\n",
+ "It also ships with a set of `DefaultRAGArchitecture` presets that describe commonly used RAG pipeline layouts, such as generation with embedding-based retrieval and generation with keyword-based retrieval. Passing one of these presets as `rag_components` tells the harness which components of the pipeline act as the query processor, the document retriever and the response generator, so that it can wire them up to its internal evaluation pipeline. Pipelines with arbitrary architectures can also be evaluated by providing this metadata manually, as shown towards the end of this notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create harnesses to evaluate the embedding-based and keyword-based RAG pipelines.\n",
+ "emb_eval_harness = RAGEvaluationHarness(emb_rag_pipeline,\n",
+ " rag_components=DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL,\n",
+ " metrics={\n",
+ " RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ " RAGEvaluationMetric.FAITHFULNESS\n",
+ " })\n",
+ "keyword_eval_harness = RAGEvaluationHarness(keyword_rag_pipeline,\n",
+ " rag_components=DefaultRAGArchitecture.GENERATION_WITH_KEYWORD_RETRIEVAL,\n",
+ " metrics={\n",
+ " RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ " RAGEvaluationMetric.FAITHFULNESS\n",
+ " })"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the inputs to the evaluation harness.\n",
+ "# These inputs will be automatically passed to the RAG pipeline\n",
+ "# and the evaluation pipeline that the harness internally uses.\n",
+ "\n",
+ "input_questions = random.sample(questions, 10)\n",
+ "\n",
+ "eval_harness_input = RAGEvaluationInput(\n",
+ " queries=[q[\"question\"] for q in input_questions],\n",
+ " ground_truth_answers=[q[\"answers\"][\"text\"][0] for q in input_questions],\n",
+ " ground_truth_documents=[\n",
+ " [\n",
+ " doc\n",
+ " for doc in document_store.storage.values()\n",
+ " if doc.meta[\"name\"] == q[\"document\"]\n",
+ " ]\n",
+ " for q in input_questions\n",
+ " ],\n",
+ " rag_pipeline_inputs={\n",
+ " \"prompt_builder\": {\"question\": [q[\"question\"] for q in input_questions]},\n",
+ " \"answer_builder\": {\"query\": [q[\"question\"] for q in input_questions]},\n",
+ " },\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b6d13904c0a84eb7b8598bd7a78c75b1",
+ "version_major": 2,
+ "version_minor": 
0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_doc_map0.25
1metric_doc_recall_single0.40
2metric_faithfulness0.65
\n", + "" + ], + "text/plain": [ + " metrics score\n", + "0 metric_doc_map 0.25\n", + "1 metric_doc_recall_single 0.40\n", + "2 metric_faithfulness 0.65" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score report:\")\n", + "emb_eval_run.results.score_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation score dataframe:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesground_truth_answersground_truth_documentsmetric_doc_mapmetric_doc_recall_singlemetric_faithfulness
0In what year did Cortes send the first cochine...[The Spanish ship San Pedro and two other vess...The information provided does not mention Cort...1523[In the United States, political commentators ...0.00.00.0
1What issues may prevent women from working out...[Children working at a young age has been a co...Some issues that may prevent women from workin...gender roles and customs[A number of researchers (David Rodda, Jacob V...0.51.01.0
2What was a neologism expressing the introducti...[After 1517, when the new invention of printin...The neologism expressing the introduction of n...Darwinism[After World War I, when Britain and France di...0.00.00.0
3When were images being used to promote the spr...[The availability of the Bible in vernacular l...Images were used to promote the spread of Luth...1530s and 1540s[Luther wrote \"Ach Gott, vom Himmel sieh darei...0.00.01.0
4In the layered model of the Earth, the mantle ...[The mantle is equivalent to 10 to 15 Earth ma...The two layers below the mantle in the Earth's...the outer core and inner core[The principle of inclusions and components st...0.51.01.0
5What ancient religious scriptures were among t...[Buddhist scriptures and other texts exist in ...The Vedas were among the first examples of Ind...The Vedas[Poetry is a form of literary art which uses a...0.00.01.0
6How many times was Tom shot?[Hidalgo was turned over to the Bishop of Dura...There is no information provided about a perso...seventeen[Sergel's play toured in the UK starting at We...0.00.00.0
7What was persistent unemployment have a negati...[As of September 2014, the greater Atlantic Ci...Persistent unemployment can have a negative ef...subsequent long-run economic growth[A number of researchers (David Rodda, Jacob V...0.51.00.5
8What lesson did Johann von Staupitz teach Luth...[His poor physical health made him short-tempe...Johann von Staupitz taught Luther that repenta...a change of heart[Luther wrote \"Ach Gott, vom Himmel sieh darei...1.01.01.0
9What is the German word for living space?[The term was created in 1920 by Hans Winkler,...The German word for living space is Lebensraum.Lebensraum[During World War II, Hitler's Generalplan Ost...0.00.01.0
\n", + "
" + ], + "text/plain": [ + " questions \\\n", + "0 In what year did Cortes send the first cochine... \n", + "1 What issues may prevent women from working out... \n", + "2 What was a neologism expressing the introducti... \n", + "3 When were images being used to promote the spr... \n", + "4 In the layered model of the Earth, the mantle ... \n", + "5 What ancient religious scriptures were among t... \n", + "6 How many times was Tom shot? \n", + "7 What was persistent unemployment have a negati... \n", + "8 What lesson did Johann von Staupitz teach Luth... \n", + "9 What is the German word for living space? \n", + "\n", + " contexts \\\n", + "0 [The Spanish ship San Pedro and two other vess... \n", + "1 [Children working at a young age has been a co... \n", + "2 [After 1517, when the new invention of printin... \n", + "3 [The availability of the Bible in vernacular l... \n", + "4 [The mantle is equivalent to 10 to 15 Earth ma... \n", + "5 [Buddhist scriptures and other texts exist in ... \n", + "6 [Hidalgo was turned over to the Bishop of Dura... \n", + "7 [As of September 2014, the greater Atlantic Ci... \n", + "8 [His poor physical health made him short-tempe... \n", + "9 [The term was created in 1920 by Hans Winkler,... \n", + "\n", + " responses \\\n", + "0 The information provided does not mention Cort... \n", + "1 Some issues that may prevent women from workin... \n", + "2 The neologism expressing the introduction of n... \n", + "3 Images were used to promote the spread of Luth... \n", + "4 The two layers below the mantle in the Earth's... \n", + "5 The Vedas were among the first examples of Ind... \n", + "6 There is no information provided about a perso... \n", + "7 Persistent unemployment can have a negative ef... \n", + "8 Johann von Staupitz taught Luther that repenta... \n", + "9 The German word for living space is Lebensraum. \n", + "\n", + " ground_truth_answers \\\n", + "0 1523 \n", + "1 gender roles and customs \n", + "2 Darwinism \n", + "3 1530s and 1540s \n", + "4 the outer core and inner core \n", + "5 The Vedas \n", + "6 seventeen \n", + "7 subsequent long-run economic growth \n", + "8 a change of heart \n", + "9 Lebensraum \n", + "\n", + " ground_truth_documents metric_doc_map \\\n", + "0 [In the United States, political commentators ... 0.0 \n", + "1 [A number of researchers (David Rodda, Jacob V... 0.5 \n", + "2 [After World War I, when Britain and France di... 0.0 \n", + "3 [Luther wrote \"Ach Gott, vom Himmel sieh darei... 0.0 \n", + "4 [The principle of inclusions and components st... 0.5 \n", + "5 [Poetry is a form of literary art which uses a... 0.0 \n", + "6 [Sergel's play toured in the UK starting at We... 0.0 \n", + "7 [A number of researchers (David Rodda, Jacob V... 0.5 \n", + "8 [Luther wrote \"Ach Gott, vom Himmel sieh darei... 1.0 \n", + "9 [During World War II, Hitler's Generalplan Ost... 
0.0 \n", + "\n", + " metric_doc_recall_single metric_faithfulness \n", + "0 0.0 0.0 \n", + "1 1.0 1.0 \n", + "2 0.0 0.0 \n", + "3 0.0 1.0 \n", + "4 1.0 1.0 \n", + "5 0.0 1.0 \n", + "6 0.0 0.0 \n", + "7 1.0 0.5 \n", + "8 1.0 1.0 \n", + "9 0.0 1.0 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score dataframe:\")\n", + "emb_eval_run.results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c2aa150209c147a095866717d8f094f4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesground_truth_answersground_truth_documentsemb_eval_run_metric_doc_mapemb_eval_run_metric_doc_recall_singleemb_eval_run_metric_faithfulnessemb_eval_run_gpt4_metric_doc_mapemb_eval_run_gpt4_metric_doc_recall_singleemb_eval_run_gpt4_metric_faithfulness
0In what year did Cortes send the first cochine...[The Spanish ship San Pedro and two other vess...The information provided does not mention Cort...1523[In the United States, political commentators ...0.00.00.00.00.01.0
1What issues may prevent women from working out...[Children working at a young age has been a co...Some issues that may prevent women from workin...gender roles and customs[A number of researchers (David Rodda, Jacob V...0.51.01.00.51.01.0
2What was a neologism expressing the introducti...[After 1517, when the new invention of printin...The neologism expressing the introduction of n...Darwinism[After World War I, when Britain and France di...0.00.00.00.00.01.0
3When were images being used to promote the spr...[The availability of the Bible in vernacular l...Images were used to promote the spread of Luth...1530s and 1540s[Luther wrote \"Ach Gott, vom Himmel sieh darei...0.00.01.00.00.01.0
4In the layered model of the Earth, the mantle ...[The mantle is equivalent to 10 to 15 Earth ma...The two layers below the mantle in the Earth's...the outer core and inner core[The principle of inclusions and components st...0.51.01.00.51.01.0
5What ancient religious scriptures were among t...[Buddhist scriptures and other texts exist in ...The Vedas were among the first examples of Ind...The Vedas[Poetry is a form of literary art which uses a...0.00.01.00.00.01.0
6How many times was Tom shot?[Hidalgo was turned over to the Bishop of Dura...There is no information provided about a perso...seventeen[Sergel's play toured in the UK starting at We...0.00.00.00.00.01.0
7What was persistent unemployment have a negati...[As of September 2014, the greater Atlantic Ci...Persistent unemployment can have a negative ef...subsequent long-run economic growth[A number of researchers (David Rodda, Jacob V...0.51.00.50.51.01.0
8What lesson did Johann von Staupitz teach Luth...[His poor physical health made him short-tempe...Johann von Staupitz taught Luther that repenta...a change of heart[Luther wrote \"Ach Gott, vom Himmel sieh darei...1.01.01.01.01.01.0
9What is the German word for living space?[The term was created in 1920 by Hans Winkler,...The German word for living space is Lebensraum.Lebensraum[During World War II, Hitler's Generalplan Ost...0.00.01.00.00.00.0
\n", + "" + ], + "text/plain": [ + " questions \\\n", + "0 In what year did Cortes send the first cochine... \n", + "1 What issues may prevent women from working out... \n", + "2 What was a neologism expressing the introducti... \n", + "3 When were images being used to promote the spr... \n", + "4 In the layered model of the Earth, the mantle ... \n", + "5 What ancient religious scriptures were among t... \n", + "6 How many times was Tom shot? \n", + "7 What was persistent unemployment have a negati... \n", + "8 What lesson did Johann von Staupitz teach Luth... \n", + "9 What is the German word for living space? \n", + "\n", + " contexts \\\n", + "0 [The Spanish ship San Pedro and two other vess... \n", + "1 [Children working at a young age has been a co... \n", + "2 [After 1517, when the new invention of printin... \n", + "3 [The availability of the Bible in vernacular l... \n", + "4 [The mantle is equivalent to 10 to 15 Earth ma... \n", + "5 [Buddhist scriptures and other texts exist in ... \n", + "6 [Hidalgo was turned over to the Bishop of Dura... \n", + "7 [As of September 2014, the greater Atlantic Ci... \n", + "8 [His poor physical health made him short-tempe... \n", + "9 [The term was created in 1920 by Hans Winkler,... \n", + "\n", + " responses \\\n", + "0 The information provided does not mention Cort... \n", + "1 Some issues that may prevent women from workin... \n", + "2 The neologism expressing the introduction of n... \n", + "3 Images were used to promote the spread of Luth... \n", + "4 The two layers below the mantle in the Earth's... \n", + "5 The Vedas were among the first examples of Ind... \n", + "6 There is no information provided about a perso... \n", + "7 Persistent unemployment can have a negative ef... \n", + "8 Johann von Staupitz taught Luther that repenta... \n", + "9 The German word for living space is Lebensraum. \n", + "\n", + " ground_truth_answers \\\n", + "0 1523 \n", + "1 gender roles and customs \n", + "2 Darwinism \n", + "3 1530s and 1540s \n", + "4 the outer core and inner core \n", + "5 The Vedas \n", + "6 seventeen \n", + "7 subsequent long-run economic growth \n", + "8 a change of heart \n", + "9 Lebensraum \n", + "\n", + " ground_truth_documents \\\n", + "0 [In the United States, political commentators ... \n", + "1 [A number of researchers (David Rodda, Jacob V... \n", + "2 [After World War I, when Britain and France di... \n", + "3 [Luther wrote \"Ach Gott, vom Himmel sieh darei... \n", + "4 [The principle of inclusions and components st... \n", + "5 [Poetry is a form of literary art which uses a... \n", + "6 [Sergel's play toured in the UK starting at We... \n", + "7 [A number of researchers (David Rodda, Jacob V... \n", + "8 [Luther wrote \"Ach Gott, vom Himmel sieh darei... \n", + "9 [During World War II, Hitler's Generalplan Ost... 
\n", + "\n", + " emb_eval_run_metric_doc_map emb_eval_run_metric_doc_recall_single \\\n", + "0 0.0 0.0 \n", + "1 0.5 1.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.5 1.0 \n", + "5 0.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.5 1.0 \n", + "8 1.0 1.0 \n", + "9 0.0 0.0 \n", + "\n", + " emb_eval_run_metric_faithfulness emb_eval_run_gpt4_metric_doc_map \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.5 \n", + "2 0.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.5 \n", + "5 1.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.5 0.5 \n", + "8 1.0 1.0 \n", + "9 1.0 0.0 \n", + "\n", + " emb_eval_run_gpt4_metric_doc_recall_single \\\n", + "0 0.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 1.0 \n", + "5 0.0 \n", + "6 0.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 0.0 \n", + "\n", + " emb_eval_run_gpt4_metric_faithfulness \n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "5 1.0 \n", + "6 1.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 0.0 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compare the results of the two evaluation runs.\n", + "print(\"Comparison of the two evaluation runs:\")\n", + "emb_eval_run.results.comparative_individual_scores_report(emb_eval_run_gpt4.results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above code, we've primarily focused on using the `default_xxx` methods of the `RAGEvaluationHarness` class. They provide a straightforward way of getting started with the evaluation of simple RAG pipelines which use prototypical components. The harness can also be used to evaluate arbitrarily complex RAG pipelines. This is done by providing the harness with some extra metadata about the pipeline to be evaluated.\n", + "\n", + "To use an arbitrary pipeline with the harness, the latter requires information about the following components (c.f `RAGExpectedComponent`):\n", + "- Query processor - Component that processes the input query. \n", + " - Expects one input that contains the query string.\n", + "- Document retriever - Component that retrieves documents based on the input query.\n", + " - Expects one output that contains the retrieved documents.\n", + "- Response generator - Component that generates responses based on the query and the retrieved documents.\n", + " - Expects one output that contains the LLM's response(s).\n", + "\n", + "For each of the above, the user needs to provide the following metadata (c.f `RAGExpectedComponentMetadata`):\n", + "- The name of the component as seen in the pipeline.\n", + "- A mapping of the component's expected inputs to their corresponding input names.\n", + "- A mapping of the component's expected outputs to their corresponding output names.\n", + "\n", + "For example, let's consider `RAGExpectedComponent.QUERY_PROCESSOR`: Assume we have a RAG pipeline with an [`OpenAITextEmbedder`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L18) component called `\"txt_embedder\"`. Since the harness is responsible for passing the pipeline's input (the query) to the `OpenAITextEmbedder`, it needs to know the name of the component. Furthermore, it also needs to know the [name of `OpenAITextEmbedder`'s input](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L135) through which the query should be supplied. 
The metadata for the above would look like this:\n",
+ "```python\n",
+ "query_processor_metadata = RAGExpectedComponentMetadata(\n",
+ " name=\"txt_embedder\",\n",
+ " input_mapping={\n",
+ " \"query\": \"text\"\n",
+ " }\n",
+ ")\n",
+ "```\n",
+ "Similarly, for `RAGExpectedComponent.DOCUMENT_RETRIEVER`: Assume the RAG pipeline has an [`InMemoryEmbeddingRetriever`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/retrievers/in_memory/embedding_retriever.py#L12) component named `\"mem_retriever\"` that is connected to `\"txt_embedder\"`.\n",
+ "```python\n",
+ "document_retriever_metadata = RAGExpectedComponentMetadata(\n",
+ " name=\"mem_retriever\",\n",
+ " output_mapping={\n",
+ " \"retrieved_documents\": \"documents\"\n",
+ " }\n",
+ ")\n",
+ "```\n",
+ "Both `\"query\"` and `\"retrieved_documents\"` are \"meta\" identifiers used by the harness to specify expected inputs and outputs; they are specific to each `RAGExpectedComponent` enum variant and are documented in their docstrings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a harness to evaluate a custom RAG pipeline.\n",
+ "# Commented out because the pipeline is not defined in this notebook.\n",
+ "\n",
+ "# custom_eval_harness = RAGEvaluationHarness(\n",
+ "# rag_pipeline=custom_rag_pipeline,\n",
+ "# rag_components={\n",
+ "# RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(\n",
+ "# \"query_embedder\", input_mapping={\"query\": \"text\"}\n",
+ "# ),\n",
+ "# RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata(\n",
+ "# \"retriever\",\n",
+ "# output_mapping={\"retrieved_documents\": \"documents\"},\n",
+ "# ),\n",
+ "# RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata(\n",
+ "# \"generator\", output_mapping={\"replies\": \"replies\"}\n",
+ "# ),\n",
+ "# },\n",
+ "# metrics={\n",
+ "# RAGEvaluationMetric.DOCUMENT_MAP,\n",
+ "# RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n",
+ "# RAGEvaluationMetric.FAITHFULNESS\n",
+ "# })"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There is no strict requirement when it comes to which components can act as a query processor, a document retriever or a response generator. For instance, it's perfectly fine if the query processor and the document retriever are the same component. In fact, this is the case when using a keyword-based retriever, which directly accepts the query (as opposed to having a query embedder in front of it)."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "dev",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}