Add extract pdf content function (#3)

Maclenn77 · Dec 8, 2023 · 7a95605 · 7a95605
1 parent f681f38
commit 7a95605
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -14,9 +14,15 @@ An Intelligent Assistant that explains you the content of a PDF file
 
 ## Deployment 
 
-Deploy in HF with Streamlit
+Deploy in HF with Streamlit.
+
+## Local
+
+Run the app locally with `streamlit run app.py`
 
 ## Stack
 
 - Streamlit
 - HuggingFace
+- Tika: for extracting PDF text
+- Java Runtime
diff --git a/app.py b/app.py
@@ -1,5 +1,12 @@
 """ A simple example of Streamlit. """
 import streamlit as st
+from tika import parser
 
-x = st.slider("Select a value")
-st.write(x, "squared is", x * x)
+pdf = st.file_uploader("Upload a file", type="pdf")
+
+if st.button("Extract text"):
+    if pdf is not None:
+        extracted_text = parser.from_file(pdf)
+        st.write(extracted_text["content"])
+    else:
+        st.write("Please upload a file of type: pdf")
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 openai
 langchain
-pdfminer
+tika
 chromadb
 sentence_transformers
 streamlit
diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt
@@ -1,2 +1,3 @@
 streamlit
+tika
 pylint