Skip to content

Commit

Permalink
feat: RAG pipeline first pass
Browse files Browse the repository at this point in the history
  • Loading branch information
danny-avila committed Mar 18, 2024
1 parent de34b67 commit eba72f3
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 37 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.venv
.env
__pycache__
__pycache__
uploads/
myenv/
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Container image for the RAG API service (FastAPI app served by uvicorn).
FROM python:3.10

# All subsequent paths are relative to /app.
WORKDIR /app

# Install pandoc and netcat
# pandoc: needed by document conversion at runtime (see the server's
# PANDOC_NOT_INSTALLED error message); netcat-openbsd: presumably for
# wait-for-db style connectivity checks — confirm. apt lists are removed
# in the same layer to keep the image small.
RUN apt-get update \
&& apt-get install -y pandoc netcat-openbsd \
&& rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached independently
# of application-code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application source last (invalidated most often).
COPY . .

# Serve main:app on all interfaces, port 8000 (matches the compose mapping).
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Binary file modified README.md
Binary file not shown.
51 changes: 50 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@ def get_env_variable(var_name: str, default_value: str = None) -> str:
POSTGRES_PASSWORD = get_env_variable("POSTGRES_PASSWORD")
DB_HOST = get_env_variable("DB_HOST")
DB_PORT = get_env_variable("DB_PORT")
COLLECTION_NAME = get_env_variable("COLLECTION_NAME", "testcollection")

CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
UPLOAD_DIR = get_env_variable("UPLOAD_DIR", "./uploads/")
env_value = get_env_variable("PDF_EXTRACT_IMAGES", "False").lower()
PDF_EXTRACT_IMAGES = True if env_value == "true" else False

CONNECTION_STRING = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{DB_HOST}:{DB_PORT}/{POSTGRES_DB}"

Expand All @@ -32,7 +36,52 @@ def get_env_variable(var_name: str, default_value: str = None) -> str:
pgvector_store = get_vector_store(
connection_string=CONNECTION_STRING,
embeddings=embeddings,
collection_name="testcollection",
collection_name=COLLECTION_NAME,
mode="async",
)
retriever = pgvector_store.as_retriever()

# File extensions treated as plain source/text; files matching one of these
# are handed to the generic text loader with known_type=True.
known_source_ext = (
    "go py java sh bat ps1 cmd js ts css cpp hpp h c cs sql log ini pl pm r "
    "dart dockerfile env php hs hsc lua nginxconf conf m mm plsql perl rb rs "
    "db2 scala bash swift vue svelte"
).split()
16 changes: 16 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from enum import Enum


class MESSAGES(str, Enum):
    """General (non-error) message strings.

    Functions are descriptors, so the Enum machinery does not turn DEFAULT
    into a member; it stays a plain callable reachable off the class.
    """

    def DEFAULT(msg=""):
        # Echo *msg* (stringified); falsy values render as the empty string.
        return f"{msg if msg else ''}"


class ERROR_MESSAGES(str, Enum):
    """User-facing error strings surfaced in HTTP error responses.

    Plain string attributes become str-valued enum members.  The callable
    attributes are message *factories*: functions are descriptors, so the
    Enum machinery leaves them as methods rather than members, and they can
    be invoked directly off the class.
    """

    def __str__(self) -> str:
        # Render members as their raw string value, not "ERROR_MESSAGES.<NAME>".
        return super().__str__()

    def DEFAULT(err=""):
        # Generic fallback; *err* may be any object (e.g. an Exception) and
        # is only interpolated when truthy.
        return f"Something went wrong :/\n{err if err else ''}"

    PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance."

    def OPENAI_NOT_FOUND(name=""):
        # The *name* argument is accepted but currently unused.
        return "OpenAI API was not found"

    OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
    FILE_NOT_FOUND = "The specified file was not found."
16 changes: 15 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,19 @@ services:
ports:
- "5433:5432"

fastapi:
build: .
environment:
- DB_HOST=db
- DB_PORT=5432
ports:
- "8000:8000"
volumes:
- ./uploads:/app/uploads
depends_on:
- db
env_file:
- .env

volumes:
pgdata2:
pgdata2:
177 changes: 172 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,57 @@
import os

import hashlib
from dotenv import find_dotenv, load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from langchain.schema import Document
from langchain_core.runnables.config import run_in_executor

from models import DocumentModel, DocumentResponse
from store import AsyncPgVector

from config import pgvector_store

load_dotenv(find_dotenv())

from config import (
PDF_EXTRACT_IMAGES,
CHUNK_SIZE,
CHUNK_OVERLAP,
pgvector_store,
known_source_ext,
# RAG_EMBEDDING_MODEL,
# RAG_EMBEDDING_MODEL_DEVICE_TYPE,
# RAG_TEMPLATE,
)

from langchain_community.document_loaders import (
WebBaseLoader,
TextLoader,
PyPDFLoader,
CSVLoader,
Docx2txtLoader,
UnstructuredEPubLoader,
UnstructuredMarkdownLoader,
UnstructuredXMLLoader,
UnstructuredRSTLoader,
UnstructuredExcelLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

from constants import ERROR_MESSAGES

app = FastAPI()

app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

app.state.CHUNK_SIZE = CHUNK_SIZE
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
app.state.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES

def get_env_variable(var_name: str) -> str:
value = os.getenv(var_name)
Expand All @@ -38,7 +76,7 @@ async def add_documents(documents: list[DocumentModel]):
ids = (
await pgvector_store.aadd_documents(docs, ids=[doc.id for doc in documents])
if isinstance(pgvector_store, AsyncPgVector)
else pgvector_store.add_documents(docs)
else pgvector_store.add_documents(docs, ids=[doc.id for doc in documents])
)
return {"message": "Documents added successfully", "ids": ids}
except Exception as e:
Expand All @@ -52,7 +90,7 @@ async def get_all_ids():
else:
ids = pgvector_store.get_all_ids()

return ids
return list(set(ids))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))

Expand Down Expand Up @@ -119,3 +157,132 @@ async def query_embeddings_by_file_id(file_id: str, query: str, k: int = 4):
return documents
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))


def generate_digest(page_content: str):
    """Return the hex MD5 digest of *page_content* (UTF-8 encoded).

    Used as a cheap content fingerprint in chunk metadata, not for security.
    """
    return hashlib.md5(page_content.encode()).hexdigest()

async def store_data_in_vector_db(data, file_id, overwrite: bool = False) -> dict:
    """Split *data* into chunks and insert them into the vector store.

    Args:
        data: Documents as produced by a LangChain loader's ``.load()``.
        file_id: Identifier stamped into each chunk's metadata and used as
            the insertion id.
        overwrite: Reserved for replacing existing documents; overwrite
            handling is not implemented yet.

    Returns:
        A dict with a human-readable "message", plus "ids" on success or
        "error" on failure.  (Fix: the annotation previously claimed bool,
        but every return path yields a dict.)
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP
    )
    documents = splitter.split_documents(data)

    # Attach provenance metadata: the owning file id and an MD5 digest of the
    # chunk text (a cheap content fingerprint).
    docs = [
        Document(
            page_content=doc.page_content,
            metadata={
                "file_id": file_id,
                "digest": generate_digest(doc.page_content),
                **(doc.metadata or {}),
            },
        )
        for doc in documents
    ]

    try:
        # NOTE(review): every chunk is inserted under the same id (file_id
        # repeated) — presumably intentional to enable delete-by-file, but
        # confirm the store tolerates duplicate ids.
        if isinstance(pgvector_store, AsyncPgVector):
            ids = await pgvector_store.aadd_documents(
                docs, ids=[file_id] * len(documents)
            )
        else:
            ids = pgvector_store.add_documents(docs, ids=[file_id] * len(documents))

        return {"message": "Documents added successfully", "ids": ids}

    except Exception as e:
        print(e)
        # A unique-constraint failure would be the hook for overwrite support.
        if e.__class__.__name__ == "UniqueConstraintError" and overwrite:
            # Overwrite would require deleting existing ids and re-inserting.
            return {
                "message": "Documents exist. Overwrite not implemented.",
                "error": str(e),
            }

        return {"message": "An error occurred while adding documents.", "error": str(e)}



def get_loader(filename: str, file_content_type: str, filepath: str):
    """Choose a LangChain document loader for the given file.

    Dispatch order: extension-specific loaders first, then MIME-type based
    ones, finally a plain-text fallback.

    Returns:
        (loader, known_type) — known_type is False only when we fell back to
        TextLoader without recognizing the extension or content type.
    """
    ext = filename.split(".")[-1].lower()
    known_type = True

    # Extension-keyed constructors; lambdas defer instantiation until matched.
    by_extension = {
        "pdf": lambda p: PyPDFLoader(p, extract_images=app.state.PDF_EXTRACT_IMAGES),
        "csv": lambda p: CSVLoader(p),
        "rst": lambda p: UnstructuredRSTLoader(p, mode="elements"),
        "xml": lambda p: UnstructuredXMLLoader(p),
        "md": lambda p: UnstructuredMarkdownLoader(p),
    }
    word_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    excel_mimes = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    if ext in by_extension:
        loader = by_extension[ext](filepath)
    elif file_content_type == "application/epub+zip":
        loader = UnstructuredEPubLoader(filepath)
    elif file_content_type == word_mime or ext in ("doc", "docx"):
        loader = Docx2txtLoader(filepath)
    elif file_content_type in excel_mimes or ext in ("xls", "xlsx"):
        loader = UnstructuredExcelLoader(filepath)
    elif ext in known_source_ext or (
        file_content_type and "text/" in file_content_type
    ):
        loader = TextLoader(filepath)
    else:
        # Unknown: still attempt a plain-text read, but flag it for the caller.
        loader = TextLoader(filepath)
        known_type = False

    return loader, known_type


@app.post("/doc")
async def store_doc(
    filepath: str,
    filename: str,
    file_content_type: str,
    file_id: str,
):
    """Load the file at *filepath*, embed its chunks, and index them under *file_id*.

    Returns a small status payload on success; raises HTTPException with an
    appropriate status code on failure.
    """
    # Reject missing files up front with a 404.
    if not os.path.exists(filepath):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            # Fix: FILE_NOT_FOUND is a str-valued enum member, not a factory
            # function — calling it raised TypeError, masking the intended 404.
            detail=ERROR_MESSAGES.FILE_NOT_FOUND,
        )

    try:
        loader, known_type = get_loader(filename, file_content_type, filepath)
        data = loader.load()
        result = await store_data_in_vector_db(data, file_id)

        # store_data_in_vector_db always returns a dict; failure dicts carry
        # an "error" key, so a truthiness check alone never detected them.
        if result and "error" not in result:
            return {
                "status": True,
                "file_id": file_id,
                "filename": filename,
                "known_type": known_type,
            }
        else:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(),
            )
    except HTTPException:
        # Fix: don't let the broad handler below rewrap deliberate HTTP
        # errors (e.g. the 500 above) into generic 400 responses.
        raise
    except Exception as e:
        print(e)
        if "No pandoc was found" in str(e):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
            )
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )

1 change: 0 additions & 1 deletion models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ class DocumentResponse(BaseModel):


class DocumentModel(BaseModel):
id: str
page_content: str
metadata: Optional[dict] = {}

Expand Down
28 changes: 0 additions & 28 deletions myrequest.py

This file was deleted.

0 comments on commit eba72f3

Please sign in to comment.