Change PDF extractor for use with HF

Maclenn77 · Dec 8, 2023 · 17cda42 · 17cda42
1 parent 438a5f9
commit 17cda42
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -24,5 +24,5 @@ Run streamlit run app.py
 
 - Streamlit
 - HuggingFace
-- Tika: For extracting pdf text
-- Java Runtime
+- pymupdf for pdf extraction
+- An OpenAI API key
diff --git a/app.py b/app.py
@@ -1,12 +1,18 @@
 """ A simple example of Streamlit. """
 import streamlit as st
-from tika import parser
+import fitz
+
+# from tika import parser
+# from openai import OpenAI
+
+# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 pdf = st.file_uploader("Upload a file", type="pdf")
 
 if st.button("Extract text"):
     if pdf is not None:
-        extracted_text = parser.from_file(pdf)
-        st.write(extracted_text["content"])
+        with fitz.open(stream=pdf.read(), filetype="pdf") as doc:  # open document
+            text = chr(12).join([page.get_text() for page in doc])
+            st.write(text)
     else:
         st.write("Please upload a file of type: pdf")
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 openai
 langchain
-tika
+pymupdf
 chromadb
 sentence_transformers
 streamlit
diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt
@@ -1,3 +1,3 @@
 streamlit
-tika
+pymupdf
 pylint