diff --git a/cookbook/retrieval-agent-Pinecone.ipynb b/cookbook/retrieval-agent-Pinecone.ipynb
new file mode 100644
index 0000000000000..852445c6cc1fe
--- /dev/null
+++ b/cookbook/retrieval-agent-Pinecone.ipynb
@@ -0,0 +1,410 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Overview\n",
+    "\n",
+    "This notebook demonstrates how to build a knowledge retrieval system with LangChain, Pinecone, and Sentence-Transformers. The workflow includes:\n",
+    "\n",
+    "1. **Data Preparation**: Loading and preprocessing a dataset to be used for knowledge retrieval.\n",
+    "2. **Embedding Generation**: Using a pre-trained Sentence Transformer model to generate embeddings for the dataset.\n",
+    "3. **Indexing**: Creating and populating a Pinecone index with the generated embeddings.\n",
+    "4. **Querying**: Implementing a retrieval system to search for relevant documents based on user queries.\n",
+    "5. **Custom Agent Tool**: Setting up a custom agent tool using LangChain to interact with the knowledge base and answer questions.\n",
+    "\n",
+    "By the end of this notebook, you will have a functional knowledge retrieval system capable of answering queries based on the provided dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import warnings\n",
+    "\n",
+    "import pandas as pd\n",
+    "from datasets import load_dataset\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from pinecone import Pinecone, ServerlessSpec\n",
+    "\n",
+    "from langchain_groq import ChatGroq\n",
+    "from langchain.agents import initialize_agent\n",
+    "from langchain.tools import Tool"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting up Data for Indexing\n",
+    "\n",
+    "### Stanford Question Answering Dataset (SQuAD)\n",
+    "\n",
+    "SQuAD 1.1 contains 100,000+ question-answer pairs on 500+ Wikipedia articles.\n"
+   ]
+  },
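+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One way to load the training split is through the `datasets` library imported above. This is a minimal sketch for reference; the parquet-based load in the next cell is equivalent and is what the rest of the notebook uses.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: load SQuAD via the datasets library and convert to a DataFrame.\n",
+    "# Equivalent to the parquet read below; shown only for reference.\n",
+    "squad_train = load_dataset(\"rajpurkar/squad\", split=\"train\")\n",
+    "df_hf = squad_train.to_pandas()\n",
+    "df_hf.head()"
+   ]
+  },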
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'validation': 'plain_text/validation-00000-of-00001.parquet'}\n",
+    "df = pd.read_parquet(\"hf://datasets/rajpurkar/squad/\" + splits[\"train\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep one row per unique context, since the contexts are what we store in the vector DB\n",
+    "df.drop_duplicates(subset='context', inplace=True)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['answers'].head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setting up the LLM via Groq\n",
+    "\n",
+    "Instantiate a Groq-hosted chat model. The `GROQ_API_KEY` environment variable must be set.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = ChatGroq(temperature=0.0, model='mixtral-8x7b-32768', api_key=os.getenv('GROQ_API_KEY'), verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Sentence-Transformers: all-MiniLM-L6-v2\n",
+    "\n",
+    "`all-MiniLM-L6-v2` is a compact transformer model for generating sentence embeddings, part of the Sentence-Transformers library.\n",
+    "\n",
+    "**Technical Details:**\n",
+    "- **Architecture**: MiniLM-L6\n",
+    "- **Layers**: 6\n",
+    "- **Embedding Size**: 384 dimensions\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
+    "embeddings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setting up Pinecone API Key\n",
+    "\n",
+    "To use Pinecone for indexing and querying, you need to set up your Pinecone API key. Follow these steps:\n",
+    "\n",
+    "1. **Sign Up/Log In to Pinecone**: If you don't have an account, sign up at [Pinecone](https://www.pinecone.io/). If you already have one, log in.\n",
+    "\n",
+    "2. **Get API Key**: Once logged in, navigate to the API keys section of your Pinecone dashboard. Create a new API key if you don't have one, and copy it.\n",
+    "\n",
+    "3. **Set API Key in Environment Variables**: Store your API key in an environment variable for security. You can do this by adding the following line to your `.bashrc`, `.zshrc`, or equivalent shell configuration file:\n",
+    "   ```bash\n",
+    "   export PINECONE_API_KEY='your_api_key_here'\n",
+    "   ```\n",
+    "   Replace `'your_api_key_here'` with the actual API key you copied.\n",
+    "\n",
+    "4. **Load API Key in Jupyter Notebook**: In your notebook, read the key with the `os` module:\n",
+    "   ```python\n",
+    "   api_key = os.getenv('PINECONE_API_KEY')\n",
+    "   ```\n",
+    "\n",
+    "With the key in place, you are ready to use Pinecone for indexing and querying.\n"
+   ]
+  },
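+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before creating the index, a quick sanity check (a minimal sketch; the sample sentence is arbitrary): the model must produce 384-dimensional vectors, matching the `dimension` we pass to Pinecone below.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: all-MiniLM-L6-v2 should return a 384-dimensional vector.\n",
+    "# The sample sentence is arbitrary; any string works here.\n",
+    "sample_vector = embeddings.encode(\"What is SQuAD?\")\n",
+    "sample_vector.shape  # expected: (384,)"
+   ]
+  },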
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_name = \"langchain-retrieval-agent\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the index only if it does not already exist, so this cell can be re-run safely\n",
+    "if index_name not in pc.list_indexes().names():\n",
+    "    pc.create_index(\n",
+    "        name=index_name,\n",
+    "        dimension=384,  # must match the embedding size of all-MiniLM-L6-v2\n",
+    "        metric=\"cosine\",\n",
+    "        spec=ServerlessSpec(\n",
+    "            cloud='aws',\n",
+    "            region='us-east-1'\n",
+    "        )\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = pc.Index(index_name)\n",
+    "index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index.describe_index_stats()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Note**: Upserting this dataset (roughly 80,000 rows before deduplication) into Pinecone can be time-consuming, so the index is not populated in this example. The steps below are correct and can be run as-is for an actual implementation.\n",
+    "\n",
+    "You can check the entries in the Pinecone console.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 500  # upsert vectors in batches\n",
+    "\n",
+    "for i in range(0, len(df), batch_size):\n",
+    "    i_end = min(i + batch_size, len(df))\n",
+    "    split_data = df.iloc[i:i_end]  # slice of the data frame for this batch\n",
+    "\n",
+    "    # Metadata stored alongside each vector\n",
+    "    metadata = [\n",
+    "        {'title': row_data['title'], 'context': row_data['context']}\n",
+    "        for _, row_data in split_data.iterrows()\n",
+    "    ]\n",
+    "\n",
+    "    # Extract documents and IDs\n",
+    "    documents = split_data['context'].tolist()\n",
+    "    ids = split_data['id'].tolist()\n",
+    "\n",
+    "    # Encode the contexts; convert to plain lists of floats for the Pinecone client\n",
+    "    embedded_documents = embeddings.encode(documents).tolist()\n",
+    "\n",
+    "    # Upsert (id, vector, metadata) triples into the index\n",
+    "    index.upsert(vectors=zip(ids, embedded_documents, metadata))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Alias the import so it does not shadow the pinecone client's Pinecone class\n",
+    "from langchain.vectorstores import Pinecone as PineconeVectorStore\n",
+    "\n",
+    "vector_store = PineconeVectorStore(\n",
+    "    index=index,                          # Pinecone index instance\n",
+    "    embedding_function=embeddings.encode, # function to convert queries into embeddings\n",
+    "    text_key=\"context\"                    # metadata field that holds the document text\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"when was the college of engineering in the University of Notre Dame established?\"\n",
+    "\n",
+    "vector_store.similarity_search(\n",
+    "    query,  # our search query\n",
+    "    k=3     # return the 3 most relevant docs\n",
+    ")"
+   ]
+  },
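+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because each vector carries `title` and `context` metadata, a search can also be restricted with a metadata filter. A sketch follows; the title value `University_of_Notre_Dame` is an assumption about SQuAD's underscore-separated article titles.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: restrict the search to one article via a metadata filter.\n",
+    "# 'University_of_Notre_Dame' assumes SQuAD's underscore-separated title format.\n",
+    "vector_store.similarity_search(\n",
+    "    query,\n",
+    "    k=3,\n",
+    "    filter={\"title\": \"University_of_Notre_Dame\"}\n",
+    ")"
+   ]
+  },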
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Custom Agent Tool\n",
+    "\n",
+    "The following chain will now act as a tool for the agent, enabling it to interact with the knowledge base and answer queries effectively.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.memory import ConversationBufferWindowMemory\n",
+    "from langchain.chains import RetrievalQA\n",
+    "\n",
+    "conversational_memory = ConversationBufferWindowMemory(\n",
+    "    memory_key='chat_history',\n",
+    "    k=5,  # remember the last 5 conversation turns\n",
+    "    return_messages=True\n",
+    ")\n",
+    "\n",
+    "# Retrieval QA chain over the Pinecone-backed vector store\n",
+    "qa = RetrievalQA.from_chain_type(\n",
+    "    llm=llm,\n",
+    "    chain_type=\"stuff\",\n",
+    "    retriever=vector_store.as_retriever()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "knowledge_base_tool = Tool(\n",
+    "    name='Knowledge Base',\n",
+    "    func=qa.invoke,\n",
+    "    description='Useful for answering general knowledge questions about the indexed articles'\n",
+    ")\n",
+    "tools = [knowledge_base_tool]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_agent = initialize_agent(\n",
+    "    agent='chat-conversational-react-description',\n",
+    "    tools=tools,\n",
+    "    llm=llm,\n",
+    "    verbose=True,\n",
+    "    max_iterations=3,\n",
+    "    early_stopping_method='generate',\n",
+    "    memory=conversational_memory\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_agent.invoke(\"can you tell me some facts about the University of Notre Dame?\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}