# Patch summary (git unified diff, five files). Text before the first
# "diff --git" header is leading commentary and is not part of any hunk.
#
#   .gitignore
#     Appends a blank line and a "tmp/" ignore entry (file still ends without
#     a trailing newline).
#   README.md
#     Adds a "- ChromaDB" bullet between "- HuggingFace" and
#     "- pymupdf for pdf extraction".
#   app.py
#     * Imports: adds "from datetime import datetime as Date" and
#       "import chromadb"; moves "import streamlit as st" below "import fitz".
#     * Removes the commented-out "# from tika import parser" and
#       "# client = OpenAI(...)" lines; "# from openai import OpenAI" stays.
#     * Creates chromadb.PersistentClient(path="tmp/chroma"), calls
#       heartbeat() (return value unused) and get_or_create_collection(
#       "pdf-explainer").
#     * Adds a "Query ChromaDb" text input plus a "Search" button that calls
#       collection.query(query_texts=[query], n_results=3) and renders, for
#       each entry of results["documents"][0], its first 150 characters
#       followed by "..." and "**Source:** " + the matching metadata "source".
#     * Renames the "Extract text" button to "Save"; it now writes only
#       text[0:200] and calls collection.add with the full extracted text as a
#       single document, metadata {"source": pdf.name} and id
#       pdf.name + str(Date.now()).
#     * Adds a "Chroma data collection" button (st.write(collection)) and a
#       "Delete Chroma Collection" button
#       (chroma_client.delete_collection(collection.name)).
#   requirements.txt
#     Adds "tiktoken" after "openai" and "pypdf" after "pymupdf".
#   wk_flow_requirements.txt
#     Adds "chromadb" after "pylint" (the new last line has no trailing
#     newline).
#
# NOTE(review): the patch's physical line structure looks collapsed (many diff
#   lines joined by single spaces, so indentation and blank context lines are
#   not visible); presumably the original line breaks must be restored before
#   it can be applied - confirm against the source commit.
# NOTE(review): the text input defaults to value="" and "Search" queries
#   unconditionally - confirm whether an empty query should be skipped.
# NOTE(review): requirements.txt gains tiktoken and pypdf, but no import of
#   either appears in the app.py hunk shown here - confirm they are needed.
# NOTE(review): the "collection" handle is obtained before the delete button is
#   handled; confirm nothing uses it after delete_collection in the same run.
diff --git a/.gitignore b/.gitignore index 68bc17f..66e8f66 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +tmp/ \ No newline at end of file diff --git a/README.md b/README.md index 0324b73..2135f3f 100644 --- a/README.md +++ b/README.md @@ -24,5 +24,6 @@ Run streamlit run app.py - Streamlit - HuggingFace +- ChromaDB - pymupdf for pdf extraction - An open ai openapi key diff --git a/app.py b/app.py index 49bd53d..0e8a5c9 100644 --- a/app.py +++ b/app.py @@ -1,18 +1,53 @@ """ A simple example of Streamlit. """ -import streamlit as st +from datetime import datetime as Date +import chromadb import fitz +import streamlit as st -# from tika import parser # from openai import OpenAI -# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) +chroma_client = chromadb.PersistentClient(path="tmp/chroma") +chroma_client.heartbeat() + +collection = chroma_client.get_or_create_collection("pdf-explainer") + +# Query ChromaDb +query = st.text_input("Query ChromaDb", value="", placeholder="Enter query") +if st.button("Search"): + results = collection.query( + query_texts=[query], + n_results=3, + ) + + for idx, result in enumerate(results["documents"][0]): + st.markdown( + result[0:150] + + "..." 
+ + "**Source:** " + + results["metadatas"][0][idx]["source"] + ) + pdf = st.file_uploader("Upload a file", type="pdf") -if st.button("Extract text"): + +if st.button("Save"): if pdf is not None: with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document text = chr(12).join([page.get_text() for page in doc]) - st.write(text) + st.write(text[0:200]) + collection.add( + documents=[text], + metadatas=[{"source": pdf.name}], + ids=[pdf.name + str(Date.now())], + ) else: st.write("Please upload a file of type: pdf") + + +if st.button("Chroma data collection"): + st.write(collection) + +if st.button("Delete Chroma Collection"): + chroma_client.delete_collection(collection.name) + st.write("Deleted Chroma Collection") diff --git a/requirements.txt b/requirements.txt index 9b9fde5..bcbff57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ openai +tiktoken langchain pymupdf +pypdf chromadb sentence_transformers streamlit \ No newline at end of file diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index 7a2e995..2389c28 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -1,3 +1,4 @@ streamlit pymupdf -pylint \ No newline at end of file +pylint +chromadb \ No newline at end of file