Skip to content

Commit

Permalink
Save pdf content in chroma collection (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
Maclenn77 authored Dec 9, 2023
1 parent 7eeda1a commit 39103b5
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

tmp/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ Run streamlit run app.py

- Streamlit
- HuggingFace
- ChromaDB
- pymupdf for pdf extraction
- An OpenAI API key
45 changes: 40 additions & 5 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,53 @@
""" A simple example of Streamlit. """
import streamlit as st
from datetime import datetime as Date
import chromadb
import fitz
import streamlit as st

# from tika import parser
# from openai import OpenAI

# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# On-disk Chroma store rooted at tmp/chroma.
chroma_client = chromadb.PersistentClient(path="tmp/chroma")
# Return value is discarded; presumably a liveness check on the client.
chroma_client.heartbeat()

# Single collection used by every widget below; created on first use.
collection = chroma_client.get_or_create_collection(name="pdf-explainer")

# Query ChromaDb
# Free-text search box; the lookup only runs when "Search" is clicked.
query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
if st.button("Search"):
    if not query.strip():
        # The text box defaults to "", so a click without typing would
        # otherwise send a blank query to the collection.
        st.write("Please enter a query")
    else:
        results = collection.query(
            query_texts=[query],
            n_results=3,
        )

        # Exactly one query text was sent, so index 0 holds its hits.
        # Documents and metadatas are parallel lists for that query.
        hits = zip(results["documents"][0], results["metadatas"][0])
        for document, metadata in hits:
            # Show a 150-character excerpt followed by the originating file.
            st.markdown(
                document[0:150]
                + "..."
                + "**Source:** "
                + metadata["source"]
            )


# Upload widget; only files of type pdf are accepted.
pdf = st.file_uploader("Upload a file", type="pdf")

# Extract the text of the uploaded pdf and store it in the Chroma collection.
if st.button("Save"):
    if pdf is None:
        st.write("Please upload a file of type: pdf")
    else:
        with fitz.open(stream=pdf.read(), filetype="pdf") as doc:  # open document
            # Pages are joined with a form feed (chr(12)) as the separator.
            text = chr(12).join([page.get_text() for page in doc])

        if not text.strip():
            # E.g. a scanned pdf with no text layer: storing an empty
            # document would only pollute the collection.
            st.write("No extractable text found in the uploaded pdf")
        else:
            # Short preview so the user can confirm what was extracted.
            st.write(text[0:200])
            collection.add(
                documents=[text],
                metadatas=[{"source": pdf.name}],
                # File name plus current timestamp, so re-uploading the same
                # file produces a new id instead of colliding.
                ids=[pdf.name + str(Date.now())],
            )


# Debug helper: render the collection object itself on the page.
show_clicked = st.button("Chroma data collection")
if show_clicked:
    st.write(collection)

# Drop the whole collection by name, then confirm to the user.
delete_clicked = st.button("Delete Chroma Collection")
if delete_clicked:
    chroma_client.delete_collection(name=collection.name)
    st.write("Deleted Chroma Collection")
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
openai
tiktoken
langchain
pymupdf
pypdf
chromadb
sentence_transformers
streamlit
3 changes: 2 additions & 1 deletion wk_flow_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
streamlit
pymupdf
pylint
pylint
chromadb

0 comments on commit 39103b5

Please sign in to comment.