From 8d2990f8cf51a181e6594b5bc2ae19edb705bd00 Mon Sep 17 00:00:00 2001
From: Haseeb <haseebasif5@gmail.com>
Date: Mon, 23 Dec 2024 21:12:47 +0500
Subject: [PATCH] Added GitHub Agent Example

---
 cookbook/Github_Agent.ipynb | 365 ++++++++++++++++++++++++++++++++++++
 1 file changed, 365 insertions(+)
 create mode 100644 cookbook/Github_Agent.ipynb

diff --git a/cookbook/Github_Agent.ipynb b/cookbook/Github_Agent.ipynb
new file mode 100644
index 0000000000000..2e0d2beedb2d5
--- /dev/null
+++ b/cookbook/Github_Agent.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests\n",
+    "from dotenv import load_dotenv\n",
+    "from langchain_core.documents import Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import faiss\n",
+    "import numpy as np\n",
+    "import re  # For text cleaning\n",
+    "from dotenv import load_dotenv\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from langchain.vectorstores import VectorStore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "github_token = os.getenv(\"GITHUB_TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "load_dotenv()\n",
+    "\n",
+    "github_token = os.getenv(\"GITHUB_TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fetch_github(owner, repo, endpoint):\n",
+    "    url = f\"https://api.github.com/repos/{owner}/{repo}/{endpoint}\"\n",
+    "    headers = {\"Authorization\": f\"Bearer {github_token}\"}\n",
+    "    all_data = []\n",
+    "    page = 1\n",
+    "\n",
+    "    while True:\n",
+    "        response = requests.get(url, headers=headers, params={\"page\": page})\n",
+    "        if response.status_code == 200:\n",
+    "            data = response.json()\n",
+    "            if not data:  # Break if no more data\n",
+    "                break\n",
+    "            all_data.extend(data)\n",
+    "            page += 1\n",
+    "        else:\n",
+    "            print(\"Failed with status code:\", response.status_code)\n",
+    "            return []\n",
+    "\n",
+    "    return all_data\n",
+    "\n",
+    "\n",
+    "def fetch_github_issues(owner, repo,endpoint):\n",
+    "    data = fetch_github(owner, repo, endpoint)\n",
+    "    return load_issues(data,endpoint,repo)\n",
+    "\n",
+    "\n",
+    "def load_issues(data,endpoint,repo):\n",
+    "    docs = []\n",
+    "    for entry in data:\n",
+    "        str_data = entry.get(\"title\", \"\") \n",
+    "        metadata = {\n",
+    "            \"type\": endpoint,\n",
+    "            \"repo\": repo,\n",
+    "            \"author\": entry[\"user\"][\"login\"],\n",
+    "            \"comments\": entry[\"comments\"],\n",
+    "            \"body\": entry[\"body\"],\n",
+    "            \"labels\": entry[\"labels\"],\n",
+    "            \"created_at\": entry[\"created_at\"][0:10], # keep only the first 10 characters of the timestamp\n",
+    "        }\n",
+    "        if entry['body']:\n",
+    "            str_data += \" \"\n",
+    "            str_data += entry['body']\n",
+    "        doc = Document(page_content=str_data, metadata=metadata)\n",
+    "        docs.append(doc)\n",
+    "\n",
+    "    return docs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "owner = \"microsoft\"\n",
+    "repo = \"DeepSpeed\"\n",
+    "docs = fetch_github_issues(owner, repo, \"issues\")  # Fetch issues from the specified repo\n",
+    "\n",
+    "    # Extract and print the created date of each issue\n",
+    "#for doc in docs:\n",
+    "        #created_at = doc.metadata.get('created_at')\n",
+    "        #print(f\"Issue created at: {created_at}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class FAISStore(VectorStore):\n",
+    "    def __init__(self):\n",
+    "        # Load the sentence-embedding model used to encode documents and queries\n",
+    "        self._embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
+    "        d = 384  # Dimension of embeddings\n",
+    "        self.index = faiss.IndexFlatL2(d)  # Use a flat index without clustering\n",
+    "        self.documents = []\n",
+    "\n",
+    "    @property\n",
+    "    def embeddings(self):\n",
+    "        return self._embeddings\n",
+    "\n",
+    "    def add_docs(self, docs):\n",
+    "        vectors_to_upsert = []\n",
+    "\n",
+    "        for doc in docs:\n",
+    "            # Encode the document's page_content into a float32 embedding vector\n",
+    "            embed_docs = self.embeddings.encode(doc.page_content).astype('float32')\n",
+    "\n",
+    "            # Build an ID from author and type (not unique: documents sharing both get the same ID)\n",
+    "            unique_id = doc.metadata.get(\"author\", \"unknown_author\") + \"_\" + doc.metadata.get(\"type\", \"unknown_type\")\n",
+    "\n",
+    "            # Append vector and unique ID\n",
+    "            vectors_to_upsert.append((unique_id, embed_docs))\n",
+    "\n",
+    "            # Store the document for future retrieval\n",
+    "            self.documents.append((unique_id, doc))  # Store Document object directly\n",
+    "\n",
+    "        # Stack the embeddings into one float32 array and add them to the FAISS index\n",
+    "        embed_docs_array = np.array([vec for _, vec in vectors_to_upsert]).astype('float32')\n",
+    "        self.index.add(embed_docs_array)  # Add vectors to the index\n",
+    "\n",
+    "    def search(self, query, k=1):\n",
+    "        # Encode the query into an embedding\n",
+    "        query_embedding = self.embeddings.encode(query).astype('float32').reshape(1, -1)\n",
+    "\n",
+    "        # Perform the similarity search\n",
+    "        D, I = self.index.search(query_embedding, k=k)\n",
+    "\n",
+    "        # Retrieve metadata and content for the results\n",
+    "        results = []\n",
+    "        for idx in I[0]:\n",
+    "            if idx >= 0:\n",
+    "                unique_id, document = self.documents[idx]\n",
+    "                results.append(document)\n",
+    "\n",
+    "        return results  # Return Document objects\n",
+    "\n",
+    "    def similarity_search(self, query, k=1):\n",
+    "        return self.search(query, k)\n",
+    "\n",
+    "    def from_texts(self, texts, metadatas=None):\n",
+    "        \"\"\" Takes a list of texts and corresponding metadata, creates Documents, and adds them to the vector store. \"\"\"\n",
+    "        docs = [Document(page_content=self.preprocess_content(text), metadata=metadata)\n",
+    "                for text, metadata in zip(texts, metadatas or [{}]*len(texts))]\n",
+    "        self.add_docs(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "store = FAISStore()\n",
+    "owner = \"microsoft\"\n",
+    "repo = \"DeepSpeed\"\n",
+    "\n",
+    "    # Fetch GitHub issues (the issues endpoint also returns pull requests) and add them to FAISS\n",
+    "docs = fetch_github_issues(owner, repo, \"issues\")\n",
+    "store.add_docs(docs)\n",
+    "\n",
+    "    # Query the FAISS index\n",
+    "result = store.similarity_search(\"Fix bug with hybrid engine generation\")\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_groq import ChatGroq  # Assuming you are using Groq for chat\n",
+    "from langchain.chains import RetrievalQA\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain import hub\n",
+    "from langchain.tools.retriever import create_retriever_tool\n",
+    "from langchain.agents import initialize_agent\n",
+    "from langchain.agents import create_tool_calling_agent\n",
+    "from langchain.agents import AgentExecutor\n",
+    "from langchain.prompts import PromptTemplate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FLAG_FILE = \"data_loaded.flag\"\n",
+    "\n",
+    "class Agent:\n",
+    "    def __init__(self):\n",
+    "        # Initialize FAISS store separately\n",
+    "        self.vector_store = FAISStore()\n",
+    "        \n",
+    "        # Initialize memory for conversation\n",
+    "        self.conversational_memory = ConversationBufferMemory(\n",
+    "            memory_key='chat_history',\n",
+    "            return_messages=True  # Store messages as a list\n",
+    "        )\n",
+    "        \n",
+    "        # Initialize the LLM\n",
+    "        self.llm = ChatGroq(\n",
+    "            temperature=0.0,\n",
+    "            model='llama-3.1-70b-versatile',\n",
+    "            api_key=os.getenv('GROQ_API_KEY'),\n",
+    "            verbose=True\n",
+    "        )\n",
+    "        \n",
+    "    def _run(self, response):\n",
+    "        template = '''This is a response from github agent. Make the Response well Structured and formatted!!\n",
+    "        Here is the response from the agent: {response}'''\n",
+    "        \n",
+    "        prompt = PromptTemplate(template=template, input_variables=['response'])\n",
+    "        formatted_prompt = prompt.format(response=response)\n",
+    "        return self.llm.invoke(formatted_prompt)\n",
+    "        \n",
+    "    \n",
+    "    def initialize(self, owner, repo, endpoint):\n",
+    "        if not os.path.exists(FLAG_FILE):  # Check if the flag file exists\n",
+    "            print(\"No data found in the FAISS store. Fetching data from GitHub...\")\n",
+    "            docs = fetch_github_issues(owner, repo, endpoint)  # Fetch issues/pulls\n",
+    "            if docs:  # Only add if documents were fetched\n",
+    "                self.vector_store.add_docs(docs)  # Add docs to the FAISS store\n",
+    "                with open(FLAG_FILE, \"w\") as f:  # Create a flag file to indicate data has been loaded\n",
+    "                    f.write(\"Data loaded\")\n",
+    "                print(f\"Added {len(docs)} documents to the FAISS store.\")\n",
+    "            else:\n",
+    "                print(\"No documents fetched from GitHub.\")\n",
+    "        else:\n",
+    "            user_input = input(\"Data is already loaded. Do you want to re-fetch it from GitHub? (yes/no): \").strip().lower()\n",
+    "            if user_input == 'yes':\n",
+    "                print(\"Re-fetching data from GitHub...\")\n",
+    "                docs = fetch_github_issues(owner, repo, endpoint)  # Fetch issues/pulls\n",
+    "                if docs:\n",
+    "                    self.vector_store.add_docs(docs)  # Add docs to the FAISS store\n",
+    "                    print(f\"Added {len(docs)} documents to the FAISS store.\")\n",
+    "                else:\n",
+    "                    print(\"No documents fetched from GitHub.\")\n",
+    "            else:\n",
+    "                print(\"Using existing data from the FAISS store.\")\n",
+    "\n",
+    "    def make_agent(self):\n",
+    "        # Wrap the vector store in a retriever for the retriever tool below\n",
+    "        retriever = self.vector_store.as_retriever()  # `as_retriever` exposes the store through the retriever interface\n",
+    "\n",
+    "        # Create the retriever tool\n",
+    "        self.retriever_tool = create_retriever_tool(\n",
+    "            retriever,\n",
+    "            \"GitHub Search\",\n",
+    "            'The user is asking question which is related to this tool .Use this tool for any question . It will search the GitHub repository for relevant issues and pull requests.'\n",
+    "        )\n",
+    "\n",
+    "        # Initialize the agent\n",
+    "        tools = [self.retriever_tool]\n",
+    "        #prompt = hub.pull(\"hwchase17/openai-functions-agent\")\n",
+    "        #agent = create_tool_calling_agent(self.llm, tools, prompt)\n",
+    "        #self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)\n",
+    "        \n",
+    "        self.agent_executor = initialize_agent(\n",
+    "            llm=self.llm,\n",
+    "            agent='conversational-react-description', \n",
+    "            tools=tools,\n",
+    "            verbose=True,\n",
+    "            max_iterations=3,\n",
+    "            memory=self.conversational_memory\n",
+    ")\n",
+    "\n",
+    "    def run_query(self, query):\n",
+    "        \"\"\"Run a query through the agent and return the response.\"\"\"\n",
+    "        response = self.agent_executor({\"input\": query})\n",
+    "        res=self._run(response)\n",
+    "        return res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent = Agent()\n",
+    "    \n",
+    "    # Initialize the agent with appropriate parameters\n",
+    "agent.initialize(owner='microsoft', repo='DeepSpeed', endpoint='issues')\n",
+    "agent.make_agent()  # Initialize the agent tools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}