diff --git a/README.md b/README.md index c4b20e4..de51c47 100644 --- a/README.md +++ b/README.md @@ -24,5 +24,5 @@ Run streamlit run app.py - Streamlit - HuggingFace -- Tika: For extracting pdf text -- Java Runtime +- PyMuPDF for PDF extraction +- An OpenAI API key diff --git a/app.py b/app.py index 70b36d9..49bd53d 100644 --- a/app.py +++ b/app.py @@ -1,12 +1,18 @@ """ A simple example of Streamlit. """ import streamlit as st -from tika import parser +import fitz + +# from tika import parser +# from openai import OpenAI + +# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) pdf = st.file_uploader("Upload a file", type="pdf") if st.button("Extract text"): if pdf is not None: - extracted_text = parser.from_file(pdf) - st.write(extracted_text["content"]) + with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document + text = chr(12).join([page.get_text() for page in doc]) + st.write(text) else: st.write("Please upload a file of type: pdf") diff --git a/requirements.txt b/requirements.txt index c440f6b..9b9fde5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ openai langchain -tika +pymupdf chromadb sentence_transformers streamlit \ No newline at end of file diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index 2dbfd41..7a2e995 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -1,3 +1,3 @@ streamlit -tika +pymupdf pylint \ No newline at end of file