From 29d2bfc662e224ed775d5b27350b191792dfe4fc Mon Sep 17 00:00:00 2001 From: shadeMe Date: Thu, 6 Jun 2024 14:07:09 +0200 Subject: [PATCH] feat: Add notebook for RAG eval harness --- examples/rag_eval_harness.ipynb | 1594 +++++++++++++++++ .../evaluation/harness/rag/harness.py | 96 +- pyproject.toml | 6 +- 3 files changed, 1670 insertions(+), 26 deletions(-) create mode 100644 examples/rag_eval_harness.ipynb diff --git a/examples/rag_eval_harness.ipynb b/examples/rag_eval_harness.ipynb new file mode 100644 index 00000000..b7c18ef6 --- /dev/null +++ b/examples/rag_eval_harness.ipynb @@ -0,0 +1,1594 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "# We assume that the haystack-experimental package is already installed.\n", + "pip install datasets\n", + "pip install sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's set the OpenAI API key environment variable to ensure that\n", + "# LLM-based evaluators can query the OpenAI API.\n", + "import os\n", + "from getpass import getpass\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + " warn(\"The installed version of bitsandbytes was compiled without GPU support. 
\"\n" + ] + } + ], + "source": [ + "# All the imports that we'll need to create the following:\n", + "# - An indexing pipeline that stores documents from our chosen dataset in a document store.\n", + "# - A retrieval pipeline that uses a query to retrieve relevant documents from the document store.\n", + "import json\n", + "from typing import List, Dict\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "import random\n", + "from datasets import load_dataset, Dataset\n", + "from tqdm import tqdm\n", + "\n", + "from haystack import Document, Pipeline\n", + "from haystack.components.builders import AnswerBuilder, PromptBuilder\n", + "from haystack.components.embedders import (\n", + " SentenceTransformersDocumentEmbedder,\n", + " SentenceTransformersTextEmbedder,\n", + ")\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.retrievers import (\n", + " InMemoryEmbeddingRetriever,\n", + " InMemoryBM25Retriever,\n", + ")\n", + "from haystack.components.writers import DocumentWriter\n", + "\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy, DocumentStore\n", + "from haystack_experimental.evaluation.harness.rag import (\n", + " RAGEvaluationHarness,\n", + " RAGEvaluationMetric,\n", + " RAGEvaluationInput,\n", + " RAGEvaluationOutput,\n", + " RAGEvaluationOverrides,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preparation\n", + "\n", + "The following steps will load the SQUAD dataset, preprocess them for the indexing pipeline and store them to a local folder in the current working directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper functions to load the SQUAD dataset.\n", + "def aggregate_wiki_title(data: Dataset, agg_wiki_title: Dict[str, Dict[str, List[str]]]):\n", + " for idx, x in enumerate(data.iter(batch_size=1)):\n", + " if x[\"context\"] not in agg_wiki_title[x[\"title\"][0]][\"context\"]:\n", + " agg_wiki_title[x[\"title\"][0]][\"context\"].append(x[\"context\"])\n", + " agg_wiki_title[x[\"title\"][0]][\"question_answers\"].append(\n", + " {\"question\": x[\"question\"], \"answers\": x[\"answers\"]}\n", + " )\n", + "\n", + "def load_transformed_squad():\n", + " with open(\"transformed_squad/questions.jsonl\", \"r\") as f:\n", + " questions = [json.loads(x) for x in f.readlines()]\n", + " for idx, question in enumerate(questions):\n", + " question[\"query_id\"] = f\"query_{idx}\"\n", + "\n", + " def create_document(text: str, name: str):\n", + " return Document(content=text, meta={\"name\": name})\n", + "\n", + " # walk through the files in the directory and transform each text file into a Document\n", + " documents = []\n", + " for root, dirs, files in os.walk(\"transformed_squad/articles/\"):\n", + " for article in files:\n", + " with open(f\"{root}/{article}\", \"r\") as f:\n", + " raw_texts = f.read().split(\"\\n\")\n", + " for text in raw_texts:\n", + " documents.append(\n", + " create_document(text, article.replace(\".txt\", \"\"))\n", + " )\n", + "\n", + " return questions, documents" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 490/490 [00:00<00:00, 66949.28it/s]\n" + ] + } + ], + "source": [ + "data_train = load_dataset(\"squad\", split=\"train\")\n", + "data_validation = load_dataset(\"squad\", split=\"validation\")\n", + "agg_wiki_title = defaultdict(\n", + " lambda: {\"context\": [], \"question_answers\": [], \"text\": 
\"\"}\n", + ")\n", + "aggregate_wiki_title(data_train, agg_wiki_title)\n", + "aggregate_wiki_title(data_validation, agg_wiki_title)\n", + "\n", + "# merge the context into a single document\n", + "for article in tqdm(agg_wiki_title.keys()):\n", + " agg_wiki_title[article][\"text\"] = \"\\n\".join(\n", + " [x[0] for x in agg_wiki_title[article][\"context\"]]\n", + " )\n", + "\n", + "# create documents\n", + "for article in agg_wiki_title.keys():\n", + " out_path = Path(\"transformed_squad/articles/\")\n", + " out_path.mkdir(parents=True, exist_ok=True)\n", + " with open(f\"{str(out_path)}/{article}.txt\", \"w\") as f:\n", + " f.write(agg_wiki_title[article][\"text\"])\n", + "\n", + "# create question/answers\n", + "questions = Path(\"transformed_squad/\")\n", + "questions.mkdir(parents=True, exist_ok=True)\n", + "with open(f\"{str(questions)}/questions.jsonl\", \"w\") as f:\n", + " for article in agg_wiki_title.keys():\n", + " for entry in agg_wiki_title[article][\"question_answers\"]:\n", + " f.write(\n", + " json.dumps(\n", + " {\n", + " \"question\": entry[\"question\"][0],\n", + " \"document\": article,\n", + " \"answers\": entry[\"answers\"][0],\n", + " }\n", + " )\n", + " + \"\\n\"\n", + " )\n", + "\n", + "questions, documents = load_transformed_squad()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create a pipeline that indexes the documents in the document store.\n", + "def indexing(documents: List[Document]) -> InMemoryDocumentStore:\n", + " document_store = InMemoryDocumentStore()\n", + "\n", + " doc_writer = DocumentWriter(\n", + " document_store=document_store, policy=DuplicatePolicy.SKIP\n", + " )\n", + " doc_embedder = SentenceTransformersDocumentEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " ingestion_pipe = Pipeline()\n", + 
" ingestion_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n", + " ingestion_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n", + "\n", + " ingestion_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n", + " ingestion_pipe.run({\"doc_embedder\": {\"documents\": documents}})\n", + "\n", + " return document_store" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mkannan/.pyenv/versions/3.10.13/envs/dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "20fe295da31045df9d92f63762587064", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/662 [00:00 Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ document.content }}\n", + " {% endfor %}\n", + "\n", + " Question: {{question}}\n", + " Answer:\n", + " \"\"\"\n", + "\n", + " pipeline = Pipeline()\n", + " pipeline.add_component(\n", + " \"query_embedder\",\n", + " SentenceTransformersTextEmbedder(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " ),\n", + " )\n", + " pipeline.add_component(\n", + " \"retriever\", InMemoryEmbeddingRetriever(document_store, top_k=top_k)\n", + " )\n", + " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + " pipeline.add_component(\n", + " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n", + " )\n", + " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n", + "\n", 
+ " pipeline.connect(\"query_embedder\", \"retriever.query_embedding\")\n", + " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", + " pipeline.connect(\"prompt_builder\", \"generator\")\n", + " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", + " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n", + " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n", + "\n", + " return pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function to create a keyword-based RAG pipeline.\n", + "def build_keyword_rag_pipeline(document_store: InMemoryDocumentStore, top_k: int = 2) -> Pipeline:\n", + " template = \"\"\"\n", + " You have to answer the following question based on the given context information only.\n", + "\n", + " Context:\n", + " {% for document in documents %}\n", + " {{ document.content }}\n", + " {% endfor %}\n", + "\n", + " Question: {{question}}\n", + " Answer:\n", + " \"\"\"\n", + "\n", + " pipeline = Pipeline()\n", + " pipeline.add_component(\n", + " \"retriever\", InMemoryBM25Retriever(document_store, top_k=top_k)\n", + " )\n", + " pipeline.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + " pipeline.add_component(\n", + " \"generator\", OpenAIGenerator(model=\"gpt-3.5-turbo\")\n", + " )\n", + " pipeline.add_component(\"answer_builder\", AnswerBuilder())\n", + "\n", + " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", + " pipeline.connect(\"prompt_builder\", \"generator\")\n", + " pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", + " pipeline.connect(\"generator.meta\", \"answer_builder.meta\")\n", + " pipeline.connect(\"retriever\", \"answer_builder.documents\")\n", + "\n", + " return pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "emb_rag_pipeline = 
build_emb_rag_pipeline(document_store, top_k=2)\n", + "keyword_rag_pipeline = build_keyword_rag_pipeline(document_store, top_k=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation harness\n", + "\n", + "The RAG evaluation harness comes with a predefined set of evaluation metrics, which are enumerated in the `RAGEvaluationMetric` enum. \n", + "\n", + "The `RAGEvaluationHarness` class comes with default initialization functions that can be used with RAG pipelines that use typical names/identifiers for their components." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a harness to evaluate the embedding-based RAG pipeline.\n", + "emb_eval_harness = RAGEvaluationHarness.default_with_embedding_retriever(emb_rag_pipeline, metrics={\n", + " RAGEvaluationMetric.DOCUMENT_MAP,\n", + " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n", + " RAGEvaluationMetric.ANSWER_FAITHFULNESS\n", + " })\n", + "keyword_eval_harness = RAGEvaluationHarness.default_with_keyword_retriever(keyword_rag_pipeline, metrics={\n", + " RAGEvaluationMetric.DOCUMENT_MAP,\n", + " RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n", + " RAGEvaluationMetric.ANSWER_FAITHFULNESS\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the inputs to the evaluation harness.\n", + "# These inputs will be automatically passed to the RAG pipeline \n", + "# and the evaluation pipeline that the harness internally uses.\n", + "\n", + "input_questions = random.sample(questions, 10)\n", + "\n", + "eval_harness_input = RAGEvaluationInput(\n", + " queries=[q[\"question\"] for q in input_questions],\n", + " ground_truth_answers=[q[\"answers\"][\"text\"][0] for q in input_questions],\n", + " ground_truth_documents=[\n", + " [\n", + " doc\n", + " for doc in document_store.storage.values()\n", + " if doc.meta[\"name\"] == 
q[\"document\"]\n", + " ]\n", + " for q in input_questions\n", + " ],\n", + " additional_rag_inputs={\n", + " \"prompt_builder\": {\"question\": [q[\"question\"] for q in input_questions]},\n", + " \"answer_builder\": {\"query\": [q[\"question\"] for q in input_questions]},\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ba7fddb6f307443cbef45f91f38d3e3f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_doc_recall_single0.7
1metric_answer_faithfulness0.7
2metric_doc_map0.6
\n", + "" + ], + "text/plain": [ + " metrics score\n", + "0 metric_doc_recall_single 0.7\n", + "1 metric_answer_faithfulness 0.7\n", + "2 metric_doc_map 0.6" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score report:\")\n", + "emb_eval_run.results.score_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation score dataframe:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesmetric_doc_recall_singlemetric_answer_faithfulnessmetric_doc_map
0Upon what are kings of Scots coronated?[Normans came into Scotland, building castles ...The kings of Scots are coronated on the Stone ...0.01.00.0
1Where is the energy stored by a capacitor loca...[A capacitor (originally known as a condenser)...The energy stored by a capacitor is located in...1.01.00.5
2What did these gardeners do about unwanted spe...[Forest gardening was also being used as a foo...The gardeners identified, protected, and impro...1.01.01.0
3What is one type of Benedictine order that was...[The Catholic Church prevailed across Europe a...One type of Benedictine order that was common ...1.00.01.0
4When did work on the ASCII standard begin?[ASCII developed from telegraphic codes. Its f...Work on the ASCII standard began on October 6,...1.00.01.0
5Where did Africans escape and mate with naitves?[Numerous communities of dark-skinned peoples ...Africans escaped and mated with natives in pre...0.01.00.0
6What direction has Europe moved towards?[Modern historiography on the period has reach...Europe has moved towards an era characterized ...0.01.00.0
7What was the Office of Special Operations init...[US army general Hoyt Vandenberg, the CIG's se...The initial budget of the Office of Special Op...1.00.01.0
8What is the large art school in Mexico City?[During the 19th century, an important produce...The large art school in Mexico City is the Esc...1.01.00.5
9When did the Hounslow Heath Aerodrome begin to...[Following the war, some of these military air...Hounslow Heath Aerodrome began to operate sche...1.01.01.0
\n", + "
" + ], + "text/plain": [ + " questions \\\n", + "0 Upon what are kings of Scots coronated? \n", + "1 Where is the energy stored by a capacitor loca... \n", + "2 What did these gardeners do about unwanted spe... \n", + "3 What is one type of Benedictine order that was... \n", + "4 When did work on the ASCII standard begin? \n", + "5 Where did Africans escape and mate with naitves? \n", + "6 What direction has Europe moved towards? \n", + "7 What was the Office of Special Operations init... \n", + "8 What is the large art school in Mexico City? \n", + "9 When did the Hounslow Heath Aerodrome begin to... \n", + "\n", + " contexts \\\n", + "0 [Normans came into Scotland, building castles ... \n", + "1 [A capacitor (originally known as a condenser)... \n", + "2 [Forest gardening was also being used as a foo... \n", + "3 [The Catholic Church prevailed across Europe a... \n", + "4 [ASCII developed from telegraphic codes. Its f... \n", + "5 [Numerous communities of dark-skinned peoples ... \n", + "6 [Modern historiography on the period has reach... \n", + "7 [US army general Hoyt Vandenberg, the CIG's se... \n", + "8 [During the 19th century, an important produce... \n", + "9 [Following the war, some of these military air... \n", + "\n", + " responses \\\n", + "0 The kings of Scots are coronated on the Stone ... \n", + "1 The energy stored by a capacitor is located in... \n", + "2 The gardeners identified, protected, and impro... \n", + "3 One type of Benedictine order that was common ... \n", + "4 Work on the ASCII standard began on October 6,... \n", + "5 Africans escaped and mated with natives in pre... \n", + "6 Europe has moved towards an era characterized ... \n", + "7 The initial budget of the Office of Special Op... \n", + "8 The large art school in Mexico City is the Esc... \n", + "9 Hounslow Heath Aerodrome began to operate sche... 
\n", + "\n", + " metric_doc_recall_single metric_answer_faithfulness metric_doc_map \n", + "0 0.0 1.0 0.0 \n", + "1 1.0 1.0 0.5 \n", + "2 1.0 1.0 1.0 \n", + "3 1.0 0.0 1.0 \n", + "4 1.0 0.0 1.0 \n", + "5 0.0 1.0 0.0 \n", + "6 0.0 1.0 0.0 \n", + "7 1.0 0.0 1.0 \n", + "8 1.0 1.0 0.5 \n", + "9 1.0 1.0 1.0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Evaluation score dataframe:\")\n", + "emb_eval_run.results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0008b3059ab549beb89017859ef2aac6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesemb_eval_run_metric_doc_recall_singleemb_eval_run_metric_answer_faithfulnessemb_eval_run_metric_doc_mapemb_eval_run_gpt4_metric_doc_recall_singleemb_eval_run_gpt4_metric_answer_faithfulnessemb_eval_run_gpt4_metric_doc_map
0Upon what are kings of Scots coronated?[Normans came into Scotland, building castles ...The kings of Scots are coronated on the Stone ...0.01.00.00.01.00.0
1Where is the energy stored by a capacitor loca...[A capacitor (originally known as a condenser)...The energy stored by a capacitor is located in...1.01.00.51.01.00.5
2What did these gardeners do about unwanted spe...[Forest gardening was also being used as a foo...The gardeners identified, protected, and impro...1.01.01.01.01.01.0
3What is one type of Benedictine order that was...[The Catholic Church prevailed across Europe a...One type of Benedictine order that was common ...1.00.01.01.00.01.0
4When did work on the ASCII standard begin?[ASCII developed from telegraphic codes. Its f...Work on the ASCII standard began on October 6,...1.00.01.01.00.01.0
5Where did Africans escape and mate with naitves?[Numerous communities of dark-skinned peoples ...Africans escaped and mated with natives in pre...0.01.00.00.01.00.0
6What direction has Europe moved towards?[Modern historiography on the period has reach...Europe has moved towards an era characterized ...0.01.00.00.01.00.0
7What was the Office of Special Operations init...[US army general Hoyt Vandenberg, the CIG's se...The initial budget of the Office of Special Op...1.00.01.01.00.01.0
8What is the large art school in Mexico City?[During the 19th century, an important produce...The large art school in Mexico City is the Esc...1.01.00.51.01.00.5
9When did the Hounslow Heath Aerodrome begin to...[Following the war, some of these military air...Hounslow Heath Aerodrome began to operate sche...1.01.01.01.01.01.0
\n", + "" + ], + "text/plain": [ + " questions \\\n", + "0 Upon what are kings of Scots coronated? \n", + "1 Where is the energy stored by a capacitor loca... \n", + "2 What did these gardeners do about unwanted spe... \n", + "3 What is one type of Benedictine order that was... \n", + "4 When did work on the ASCII standard begin? \n", + "5 Where did Africans escape and mate with naitves? \n", + "6 What direction has Europe moved towards? \n", + "7 What was the Office of Special Operations init... \n", + "8 What is the large art school in Mexico City? \n", + "9 When did the Hounslow Heath Aerodrome begin to... \n", + "\n", + " contexts \\\n", + "0 [Normans came into Scotland, building castles ... \n", + "1 [A capacitor (originally known as a condenser)... \n", + "2 [Forest gardening was also being used as a foo... \n", + "3 [The Catholic Church prevailed across Europe a... \n", + "4 [ASCII developed from telegraphic codes. Its f... \n", + "5 [Numerous communities of dark-skinned peoples ... \n", + "6 [Modern historiography on the period has reach... \n", + "7 [US army general Hoyt Vandenberg, the CIG's se... \n", + "8 [During the 19th century, an important produce... \n", + "9 [Following the war, some of these military air... \n", + "\n", + " responses \\\n", + "0 The kings of Scots are coronated on the Stone ... \n", + "1 The energy stored by a capacitor is located in... \n", + "2 The gardeners identified, protected, and impro... \n", + "3 One type of Benedictine order that was common ... \n", + "4 Work on the ASCII standard began on October 6,... \n", + "5 Africans escaped and mated with natives in pre... \n", + "6 Europe has moved towards an era characterized ... \n", + "7 The initial budget of the Office of Special Op... \n", + "8 The large art school in Mexico City is the Esc... \n", + "9 Hounslow Heath Aerodrome began to operate sche... 
\n", + "\n", + " emb_eval_run_metric_doc_recall_single \\\n", + "0 0.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "5 0.0 \n", + "6 0.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 1.0 \n", + "\n", + " emb_eval_run_metric_answer_faithfulness emb_eval_run_metric_doc_map \\\n", + "0 1.0 0.0 \n", + "1 1.0 0.5 \n", + "2 1.0 1.0 \n", + "3 0.0 1.0 \n", + "4 0.0 1.0 \n", + "5 1.0 0.0 \n", + "6 1.0 0.0 \n", + "7 0.0 1.0 \n", + "8 1.0 0.5 \n", + "9 1.0 1.0 \n", + "\n", + " emb_eval_run_gpt4_metric_doc_recall_single \\\n", + "0 0.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "5 0.0 \n", + "6 0.0 \n", + "7 1.0 \n", + "8 1.0 \n", + "9 1.0 \n", + "\n", + " emb_eval_run_gpt4_metric_answer_faithfulness \\\n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "5 1.0 \n", + "6 1.0 \n", + "7 0.0 \n", + "8 1.0 \n", + "9 1.0 \n", + "\n", + " emb_eval_run_gpt4_metric_doc_map \n", + "0 0.0 \n", + "1 0.5 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "5 0.0 \n", + "6 0.0 \n", + "7 1.0 \n", + "8 0.5 \n", + "9 1.0 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compare the results of the two evaluation runs.\n", + "print(\"Comparison of the two evaluation runs:\")\n", + "emb_eval_run.results.comparative_individual_scores_report(emb_eval_run_gpt4.results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above code, we've primarily focused on using the `default_xxx` methods of the `RAGEvaluationHarness` class. They provide a straightforward way of getting started with the evaluation of simple RAG pipelines which use prototypical components. The harness can also be used to evaluate arbitrarily complex RAG pipelines. 
This is done by providing the harness with some extra metadata about the pipeline to be evaluated.\n", + "\n", + "To use an arbitrary pipeline with the harness, the latter requires information about the following components (c.f `RAGExpectedComponent`):\n", + "- Query processor - Component that processes the input query. \n", + " - Expects one input that contains the query string.\n", + "- Document retriever - Component that retrieves documents based on the input query.\n", + " - Expects one output that contains the retrieved documents.\n", + "- Response generator - Component that generates responses based on the query and the retrieved documents.\n", + " - Expects one output that contains the LLM's response(s).\n", + "\n", + "For each of the above, the user needs to provide the following metadata (c.f `RAGExpectedComponentMetadata`):\n", + "- The name of the component as seen in the pipeline.\n", + "- A mapping of the component's expected inputs to their corresponding input names.\n", + "- A mapping of the component's expected outputs to their corresponding output names.\n", + "\n", + "For example, let's consider `RAGExpectedComponent.QUERY_PROCESSOR`: Assume we have a RAG pipeline with an [`OpenAITextEmbedder`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L18) component called `\"txt_embedder\"`. Since the harness is responsible for passing the pipeline's input (the query) to the `OpenAITextEmbedder`, it needs to know the name of the component. Furthermore, it also needs to know the [name of `OpenAITextEmbedder`'s input](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L135) through which the query should be supplied. 
The metadata for the above looks thus:\n", + "```python\n", + "query_processor_metadata = RAGExpectedComponentMetadata(\n", + " name=\"txt_embedder\",\n", + " input_mapping={\n", + " \"query\": \"text\"\n", + " }\n", + ")\n", + "```\n", + "Similarly, for `RAGExpectedComponent.DOCUMENT_RETRIEVER`: Assume the RAG pipeline has an [`InMemoryEmbeddingRetriever`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/retrievers/in_memory/embedding_retriever.py#L12) component named `\"mem_retriever\"` and is connected to `\"txt_embedder\"`.\n", + "```python\n", + "document_retriever_metadata = RAGExpectedComponentMetadata(\n", + " name=\"mem_retriever\",\n", + " output_mapping={\n", + " \"retrieved_documents\": \"documents\"\n", + " }\n", + ")\n", + "```\n", + "Both `\"query\"` and `\"retrieved_documents\"` are \"meta\" identifiers used by the harness to specify expected inputs and outputs - They are specific to each `RAGExpectedComponent` enum variant and are documented in their docstrings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a harness to evaluate a custom RAG pipeline.\n", + "# Commented out because the pipeline is not defined in this notebook.\n", + "\n", + "# custom_eval_harness = RAGEvaluationHarness(\n", + "# rag_pipeline=custom_rag_pipeline,\n", + "# rag_components={\n", + "# RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(\n", + "# \"query_embedder\", input_mapping={\"query\": \"text\"}\n", + "# ),\n", + "# RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata(\n", + "# \"retriever\",\n", + "# output_mapping={\"retrieved_documents\": \"documents\"},\n", + "# ),\n", + "# RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata(\n", + "# \"generator\", output_mapping={\"replies\": \"replies\"}\n", + "# ),\n", + "# },\n", + "# metrics={\n", + "# RAGEvaluationMetric.DOCUMENT_MAP,\n", + "# RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,\n", + "# RAGEvaluationMetric.ANSWER_FAITHFULNESS\n", + "# })" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is no strict requirement when it comes to which components can act as a query processor, a document retriever or a response generator. For instance, it's perfectly fine if the query processor and the document retriever are the same component. In fact, this is the case when using a keyword-based retriever which directly accepts the query (as opposed to having a query embedder in front of it)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py index f0b51165..cad1227e 100644 --- a/haystack_experimental/evaluation/harness/rag/harness.py +++ b/haystack_experimental/evaluation/harness/rag/harness.py @@ -25,7 +25,9 @@ ) -class RAGEvaluationHarness(EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput]): +class RAGEvaluationHarness( + EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput] +): """ Evaluation harness for evaluating RAG pipelines. 
""" @@ -167,7 +169,9 @@ def _lookup_component_output( output_name = mapping[output_name] return outputs[name][output_name] - def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverrides]) -> PipelinePair: + def _generate_eval_run_pipelines( + self, overrides: Optional[RAGEvaluationOverrides] + ) -> PipelinePair: if overrides is None: rag_overrides = None eval_overrides = None @@ -178,7 +182,9 @@ def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverride if eval_overrides is not None: for metric in eval_overrides.keys(): if metric not in self.metrics: - raise ValueError(f"Cannot override parameters of unused evaluation metric '{metric.value}'") + raise ValueError( + f"Cannot override parameters of unused evaluation metric '{metric.value}'" + ) eval_overrides = {k.value: v for k, v in eval_overrides.items()} # type: ignore @@ -198,13 +204,21 @@ def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverride }, ) - def _aggregate_rag_outputs(self, outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Dict[str, Any]]: + def _aggregate_rag_outputs( + self, outputs: List[Dict[str, Dict[str, Any]]] + ) -> Dict[str, Dict[str, Any]]: aggregate = aggregate_batched_pipeline_outputs(outputs) # We only care about the first response from the generator. 
- generator_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name - replies_output_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].output_mapping["replies"] - aggregate[generator_name][replies_output_name] = [r[0] for r in aggregate[generator_name][replies_output_name]] + generator_name = self.rag_components[ + RAGExpectedComponent.RESPONSE_GENERATOR + ].name + replies_output_name = self.rag_components[ + RAGExpectedComponent.RESPONSE_GENERATOR + ].output_mapping["replies"] + aggregate[generator_name][replies_output_name] = [ + r[0] for r in aggregate[generator_name][replies_output_name] + ] return aggregate @@ -247,7 +261,10 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]: RAGExpectedComponent.DOCUMENT_RETRIEVER, "retrieved_documents", ), - "responses": (RAGExpectedComponent.RESPONSE_GENERATOR, "replies"), + "predicted_answers": ( + RAGExpectedComponent.RESPONSE_GENERATOR, + "replies", + ), }, } @@ -266,9 +283,15 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]: return outputs_to_inputs - def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[str, Dict[str, Any]]]: - query_embedder_name = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].name - query_embedder_text_input = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].input_mapping["query"] + def _prepare_rag_pipeline_inputs( + self, inputs: RAGEvaluationInput + ) -> List[Dict[str, Dict[str, Any]]]: + query_embedder_name = self.rag_components[ + RAGExpectedComponent.QUERY_PROCESSOR + ].name + query_embedder_text_input = self.rag_components[ + RAGExpectedComponent.QUERY_PROCESSOR + ].input_mapping["query"] if inputs.additional_rag_inputs is not None: # Ensure that the query embedder input is not provided as additional input. 
@@ -284,14 +307,22 @@ def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[ rag_inputs = deepcopy(inputs.additional_rag_inputs) if query_embedder_name not in rag_inputs: rag_inputs[query_embedder_name] = {} - rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(inputs.queries) + rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy( + inputs.queries + ) else: - rag_inputs = {query_embedder_name: {query_embedder_text_input: deepcopy(inputs.queries)}} + rag_inputs = { + query_embedder_name: { + query_embedder_text_input: deepcopy(inputs.queries) + } + } separate_rag_inputs = deaggregate_batched_pipeline_inputs(rag_inputs) return separate_rag_inputs - def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) -> Dict[str, Dict[str, Any]]: + def _prepare_eval_pipeline_additional_inputs( + self, inputs: RAGEvaluationInput + ) -> Dict[str, Dict[str, Any]]: eval_inputs: Dict[str, Dict[str, List[Any]]] = {} for metric in self.metrics: @@ -302,18 +333,30 @@ def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) - RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, ): if inputs.ground_truth_documents is None: - raise ValueError(f"Ground truth documents required for metric '{metric.value}'.") + raise ValueError( + f"Ground truth documents required for metric '{metric.value}'." + ) if len(inputs.ground_truth_documents) != len(inputs.queries): - raise ValueError("Length of ground truth documents should match the number of queries.") + raise ValueError( + "Length of ground truth documents should match the number of queries." 
+ ) - eval_inputs[metric.value] = {"ground_truth_documents": inputs.ground_truth_documents} + eval_inputs[metric.value] = { + "ground_truth_documents": inputs.ground_truth_documents + } elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: if inputs.ground_truth_answers is None: - raise ValueError(f"Ground truth answers required for metric '{metric.value}'.") + raise ValueError( + f"Ground truth answers required for metric '{metric.value}'." + ) if len(inputs.ground_truth_answers) != len(inputs.queries): - raise ValueError("Length of ground truth answers should match the number of queries.") + raise ValueError( + "Length of ground truth answers should match the number of queries." + ) - eval_inputs[metric.value] = {"ground_truth_answers": inputs.ground_truth_answers} + eval_inputs[metric.value] = { + "ground_truth_answers": inputs.ground_truth_answers + } elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS: eval_inputs[metric.value] = {"questions": inputs.queries} @@ -326,13 +369,20 @@ def _validate_rag_components( ): for e in RAGExpectedComponent: if e not in components: - raise ValueError(f"RAG evaluation harness requires metadata for the '{e.value}' component.") + raise ValueError( + f"RAG evaluation harness requires metadata for the '{e.value}' component." + ) - pipeline_outputs = pipeline.outputs(include_components_with_connected_outputs=True) + pipeline_outputs = pipeline.outputs( + include_components_with_connected_outputs=True + ) pipeline_inputs = pipeline.inputs(include_components_with_connected_inputs=True) for component, metadata in components.items(): - if metadata.name not in pipeline_outputs or metadata.name not in pipeline_inputs: + if ( + metadata.name not in pipeline_outputs + or metadata.name not in pipeline_inputs + ): raise ValueError( f"Expected '{component.value}' component named '{metadata.name}' not found in pipeline." 
) diff --git a/pyproject.toml b/pyproject.toml index 85214d0e..78245150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ extra-dependencies = [ ] [tool.hatch.envs.test.scripts] -unit = 'pytest --cov-report xml:coverage.xml --cov="haystack-experimental" -m "not integration" {args:test}' +unit = 'pytest --cov-report xml:coverage.xml --cov="haystack_experimental" -m "not integration" {args:test}' integration = 'pytest --maxfail=5 -m "integration" {args:test}' typing = "mypy --install-types --non-interactive {args:haystack_experimental}" lint = [ @@ -77,10 +77,10 @@ path = "haystack_experimental/version.py" allow-direct-references = true [tool.hatch.build.targets.sdist] -include = ["/haystack-experimental", "/VERSION.txt"] +include = ["/haystack_experimental", "/VERSION.txt"] [tool.hatch.build.targets.wheel] -packages = ["haystack-experimental"] +packages = ["haystack_experimental"] [tool.codespell] ignore-words-list = "ans,astroid,nd,ned,nin,ue,rouge,ist"