Skip to content

Commit

Permalink
feat: RAG pipeline first pass
Browse files Browse the repository at this point in the history
  • Loading branch information
danny-avila committed Mar 18, 2024
1 parent de34b67 commit eba72f3
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 37 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.venv
.env
__pycache__
__pycache__
uploads/
myenv/
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Container image for the RAG API service (FastAPI app served by uvicorn).
FROM python:3.10

# All subsequent paths are relative to /app.
WORKDIR /app

# Install pandoc and netcat
# pandoc: needed by document conversion at runtime (see the server's
# PANDOC_NOT_INSTALLED error message); netcat-openbsd: presumably for
# wait-for-db style connectivity checks — confirm. apt lists are removed
# in the same layer to keep the image small.
RUN apt-get update \
&& apt-get install -y pandoc netcat-openbsd \
&& rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached independently
# of application-code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application source last (invalidated most often).
COPY . .

# Serve main:app on all interfaces, port 8000 (matches the compose mapping).
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Binary file modified README.md
Binary file not shown.
51 changes: 50 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@ def get_env_variable(var_name: str, default_value: str = None) -> str:
POSTGRES_PASSWORD = get_env_variable("POSTGRES_PASSWORD")
DB_HOST = get_env_variable("DB_HOST")
DB_PORT = get_env_variable("DB_PORT")
COLLECTION_NAME = get_env_variable("COLLECTION_NAME", "testcollection")

CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
UPLOAD_DIR = get_env_variable("UPLOAD_DIR", "./uploads/")
env_value = get_env_variable("PDF_EXTRACT_IMAGES", "False").lower()
PDF_EXTRACT_IMAGES = True if env_value == "true" else False

CONNECTION_STRING = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{DB_HOST}:{DB_PORT}/{POSTGRES_DB}"

Expand All @@ -32,7 +36,52 @@ def get_env_variable(var_name: str, default_value: str = None) -> str:
pgvector_store = get_vector_store(
connection_string=CONNECTION_STRING,
embeddings=embeddings,
collection_name="testcollection",
collection_name=COLLECTION_NAME,
mode="async",
)
retriever = pgvector_store.as_retriever()

# File extensions treated as plain source/text; files matching one of these
# are handed to the generic text loader with known_type=True.
known_source_ext = (
    "go py java sh bat ps1 cmd js ts css cpp hpp h c cs sql log ini pl pm r "
    "dart dockerfile env php hs hsc lua nginxconf conf m mm plsql perl rb rs "
    "db2 scala bash swift vue svelte"
).split()
16 changes: 16 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from enum import Enum


class MESSAGES(str, Enum):
    """General (non-error) message strings.

    Functions are descriptors, so the Enum machinery does not turn DEFAULT
    into a member; it stays a plain callable reachable off the class.
    """

    def DEFAULT(msg=""):
        # Echo *msg* (stringified); falsy values render as the empty string.
        return f"{msg if msg else ''}"


class ERROR_MESSAGES(str, Enum):
    """User-facing error strings surfaced in HTTP error responses.

    Plain string attributes become str-valued enum members.  The callable
    attributes are message *factories*: functions are descriptors, so the
    Enum machinery leaves them as methods rather than members, and they can
    be invoked directly off the class.
    """

    def __str__(self) -> str:
        # Render members as their raw string value, not "ERROR_MESSAGES.<NAME>".
        return super().__str__()

    def DEFAULT(err=""):
        # Generic fallback; *err* may be any object (e.g. an Exception) and
        # is only interpolated when truthy.
        return f"Something went wrong :/\n{err if err else ''}"

    PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance."

    def OPENAI_NOT_FOUND(name=""):
        # The *name* argument is accepted but currently unused.
        return "OpenAI API was not found"

    OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
    FILE_NOT_FOUND = "The specified file was not found."
16 changes: 15 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,19 @@ services:
ports:
- "5433:5432"

fastapi:
build: .
environment:
- DB_HOST=db
- DB_PORT=5432
ports:
- "8000:8000"
volumes:
- ./uploads:/app/uploads
depends_on:
- db
env_file:
- .env

volumes:
pgdata2:
pgdata2:
177 changes: 172 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,57 @@
import os

import hashlib
from dotenv import find_dotenv, load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from langchain.schema import Document
from langchain_core.runnables.config import run_in_executor

from models import DocumentModel, DocumentResponse
from store import AsyncPgVector

from config import pgvector_store

load_dotenv(find_dotenv())

from config import (
PDF_EXTRACT_IMAGES,
CHUNK_SIZE,
CHUNK_OVERLAP,
pgvector_store,
known_source_ext,
# RAG_EMBEDDING_MODEL,
# RAG_EMBEDDING_MODEL_DEVICE_TYPE,
# RAG_TEMPLATE,
)

from langchain_community.document_loaders import (
WebBaseLoader,
TextLoader,
PyPDFLoader,
CSVLoader,
Docx2txtLoader,
UnstructuredEPubLoader,
UnstructuredMarkdownLoader,
UnstructuredXMLLoader,
UnstructuredRSTLoader,
UnstructuredExcelLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

from constants import ERROR_MESSAGES

app = FastAPI()

app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

app.state.CHUNK_SIZE = CHUNK_SIZE
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
app.state.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES

def get_env_variable(var_name: str) -> str:
value = os.getenv(var_name)
Expand All @@ -38,7 +76,7 @@ async def add_documents(documents: list[DocumentModel]):
ids = (
await pgvector_store.aadd_documents(docs, ids=[doc.id for doc in documents])
if isinstance(pgvector_store, AsyncPgVector)
else pgvector_store.add_documents(docs)
else pgvector_store.add_documents(docs, ids=[doc.id for doc in documents])
)
return {"message": "Documents added successfully", "ids": ids}
except Exception as e:
Expand All @@ -52,7 +90,7 @@ async def get_all_ids():
else:
ids = pgvector_store.get_all_ids()

return ids
return list(set(ids))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))

Expand Down Expand Up @@ -119,3 +157,132 @@ async def query_embeddings_by_file_id(file_id: str, query: str, k: int = 4):
return documents
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


def generate_digest(page_content: str):
    """Return the hex MD5 digest of *page_content* (UTF-8 encoded).

    Used as a cheap content fingerprint in chunk metadata, not for security.
    """
    return hashlib.md5(page_content.encode()).hexdigest()

async def store_data_in_vector_db(data, file_id, overwrite: bool = False) -> dict:
    """Split *data* into chunks and insert them into the vector store.

    Args:
        data: Documents as produced by a LangChain loader's ``.load()``.
        file_id: Identifier stamped into each chunk's metadata and used as
            the insertion id.
        overwrite: Reserved for replacing existing documents; overwrite
            handling is not implemented yet.

    Returns:
        A dict with a human-readable "message", plus "ids" on success or
        "error" on failure.  (Fix: the annotation previously claimed bool,
        but every return path yields a dict.)
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP
    )
    documents = splitter.split_documents(data)

    # Attach provenance metadata: the owning file id and an MD5 digest of the
    # chunk text (a cheap content fingerprint).
    docs = [
        Document(
            page_content=doc.page_content,
            metadata={
                "file_id": file_id,
                "digest": generate_digest(doc.page_content),
                **(doc.metadata or {}),
            },
        )
        for doc in documents
    ]

    try:
        # NOTE(review): every chunk is inserted under the same id (file_id
        # repeated) — presumably intentional to enable delete-by-file, but
        # confirm the store tolerates duplicate ids.
        if isinstance(pgvector_store, AsyncPgVector):
            ids = await pgvector_store.aadd_documents(
                docs, ids=[file_id] * len(documents)
            )
        else:
            ids = pgvector_store.add_documents(docs, ids=[file_id] * len(documents))

        return {"message": "Documents added successfully", "ids": ids}

    except Exception as e:
        print(e)
        # A unique-constraint failure would be the hook for overwrite support.
        if e.__class__.__name__ == "UniqueConstraintError" and overwrite:
            # Overwrite would require deleting existing ids and re-inserting.
            return {
                "message": "Documents exist. Overwrite not implemented.",
                "error": str(e),
            }

        return {"message": "An error occurred while adding documents.", "error": str(e)}



def get_loader(filename: str, file_content_type: str, filepath: str):
    """Choose a LangChain document loader for the given file.

    Dispatch order: extension-specific loaders first, then MIME-type based
    ones, finally a plain-text fallback.

    Returns:
        (loader, known_type) — known_type is False only when we fell back to
        TextLoader without recognizing the extension or content type.
    """
    ext = filename.split(".")[-1].lower()
    known_type = True

    # Extension-keyed constructors; lambdas defer instantiation until matched.
    by_extension = {
        "pdf": lambda p: PyPDFLoader(p, extract_images=app.state.PDF_EXTRACT_IMAGES),
        "csv": lambda p: CSVLoader(p),
        "rst": lambda p: UnstructuredRSTLoader(p, mode="elements"),
        "xml": lambda p: UnstructuredXMLLoader(p),
        "md": lambda p: UnstructuredMarkdownLoader(p),
    }
    word_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    excel_mimes = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    if ext in by_extension:
        loader = by_extension[ext](filepath)
    elif file_content_type == "application/epub+zip":
        loader = UnstructuredEPubLoader(filepath)
    elif file_content_type == word_mime or ext in ("doc", "docx"):
        loader = Docx2txtLoader(filepath)
    elif file_content_type in excel_mimes or ext in ("xls", "xlsx"):
        loader = UnstructuredExcelLoader(filepath)
    elif ext in known_source_ext or (
        file_content_type and "text/" in file_content_type
    ):
        loader = TextLoader(filepath)
    else:
        # Unknown: still attempt a plain-text read, but flag it for the caller.
        loader = TextLoader(filepath)
        known_type = False

    return loader, known_type


@app.post("/doc")
async def store_doc(
    filepath: str,
    filename: str,
    file_content_type: str,
    file_id: str,
):
    """Load the file at *filepath*, embed its chunks, and index them under *file_id*.

    Returns a small status payload on success; raises HTTPException with an
    appropriate status code on failure.
    """
    # Reject missing files up front with a 404.
    if not os.path.exists(filepath):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            # Fix: FILE_NOT_FOUND is a str-valued enum member, not a factory
            # function — calling it raised TypeError, masking the intended 404.
            detail=ERROR_MESSAGES.FILE_NOT_FOUND,
        )

    try:
        loader, known_type = get_loader(filename, file_content_type, filepath)
        data = loader.load()
        result = await store_data_in_vector_db(data, file_id)

        # store_data_in_vector_db always returns a dict; failure dicts carry
        # an "error" key, so a truthiness check alone never detected them.
        if result and "error" not in result:
            return {
                "status": True,
                "file_id": file_id,
                "filename": filename,
                "known_type": known_type,
            }
        else:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(),
            )
    except HTTPException:
        # Fix: don't let the broad handler below rewrap deliberate HTTP
        # errors (e.g. the 500 above) into generic 400 responses.
        raise
    except Exception as e:
        print(e)
        if "No pandoc was found" in str(e):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
            )
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )

1 change: 0 additions & 1 deletion models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ class DocumentResponse(BaseModel):


class DocumentModel(BaseModel):
id: str
page_content: str
metadata: Optional[dict] = {}

Expand Down
28 changes: 0 additions & 28 deletions myrequest.py

This file was deleted.

0 comments on commit eba72f3

Please sign in to comment.