From f681f384c76418cbb6900f9b061d32e53591e950 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Paulo=20P=C3=A9rez-Tejada?=
Date: Fri, 8 Dec 2023 12:16:26 -0600
Subject: [PATCH 1/2] fix: license on readme file

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 904a783..d66de3f 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ colorTo: pink
 sdk: streamlit
 app_file: app.py
 pinned: false
-license: MIT
+license: mit
 ---
 
 # pdf-explainer

From 7a956056417aed3da09d2ae1d88fe1e45b749f19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Paulo=20P=C3=A9rez-Tejada?=
Date: Fri, 8 Dec 2023 13:07:56 -0600
Subject: [PATCH 2/2] Add extract pdf content function (#3)

---
 README.md                | 8 +++++++-
 app.py                   | 11 +++++++++--
 requirements.txt         | 2 +-
 wk_flow_requirements.txt | 1 +
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d66de3f..f17b0f8 100644
--- a/README.md
+++ b/README.md
@@ -14,9 +14,15 @@ An Intelligent Assistant that explains you the content of a PDF file
 
 ## Deployment
 
-Deploy in HF with Streamlit
+Deploy in HF with Streamlit-
+
+## Local
+
+Run streamlit run app.py
 
 ## Stack
 
 - Streamlit
 - HuggingFace
+- Tika: For extracting pdf text
+- Java Runtime
diff --git a/app.py b/app.py
index c8a975b..70b36d9 100644
--- a/app.py
+++ b/app.py
@@ -1,5 +1,12 @@
 """ A simple example of Streamlit.
 """
 import streamlit as st
+from tika import parser
-x = st.slider("Select a value")
-st.write(x, "squared is", x * x)
+pdf = st.file_uploader("Upload a file", type="pdf")
+
+if st.button("Extract text"):
+    if pdf is not None:
+        extracted_text = parser.from_file(pdf)
+        st.write(extracted_text["content"])
+    else:
+        st.write("Please upload a file of type: pdf")
diff --git a/requirements.txt b/requirements.txt
index 81bee28..c440f6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 openai
 langchain
-pdfminer
+tika
 chromadb
 sentence_transformers
 streamlit
\ No newline at end of file
diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt
index e74bbd2..2dbfd41 100644
--- a/wk_flow_requirements.txt
+++ b/wk_flow_requirements.txt
@@ -1,2 +1,3 @@
 streamlit
+tika
 pylint
\ No newline at end of file