add PDF reading to summarizer
Signed-off-by: Michael Clifford <[email protected]>
MichaelClifford committed Apr 22, 2024
1 parent 2f67b82 commit 88ea707
Showing 2 changed files with 25 additions and 6 deletions.
Binary file added data/fake_meeting.pdf
Binary file not shown.
31 changes: 25 additions & 6 deletions recipes/natural_language_processing/summarizer/app/summarizer.py
@@ -2,6 +2,7 @@
 from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_community.callbacks import StreamlitCallbackHandler
+from langchain_community.document_loaders import PyPDFLoader
 import streamlit as st
 import requests
 import time
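
The only code change in this first hunk is the new PyPDFLoader import. A quick standalone check of what the loader returns might look like the sketch below (illustration only, not part of the commit; it assumes the pypdf dependency is installed and that the script runs from the repository root so the data/fake_meeting.pdf sample added by this commit is on disk):

from langchain_community.document_loaders import PyPDFLoader

# Each element of `pages` is a LangChain Document holding one PDF page.
pages = PyPDFLoader("data/fake_meeting.pdf").load()
print(f"{len(pages)} page(s); first 80 chars: {pages[0].page_content[:80]!r}")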
@@ -44,14 +45,30 @@ def chunk_text(text):
         text_list = text_list[chunk_size:]
     return chunks
 
+def read_file(file):
+    file_type = file.type
+
+    if file_type == "application/pdf":
+        with open("tmp_pdf", "wb") as f:
+            f.write(file.getvalue())
+        loader = PyPDFLoader("tmp_pdf",)
+        pages = loader.load()
+        text = "".join([p.page_content for p in pages])
+        os.remove("tmp_pdf")
+
+    if file_type == "text/plain":
+        text = file.read().decode()
+
+    return text
+
 st.title("🔎 Summarizer")
-file = st.file_uploader("Upload file")
+file = st.file_uploader("Upload file",type=[".txt",".pdf"])
 
 llm = ChatOpenAI(base_url=model_service,
     api_key="not required",
     streaming=True,
-    max_tokens=200,
-    temperature=0
+    temperature=0.0,
+    max_tokens=400,
     )
 
 ### prompt example is from https://python.langchain.com/docs/use_cases/summarization
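
The new read_file helper writes the uploaded bytes to a fixed tmp_pdf path because PyPDFLoader loads from a file path rather than a file-like object, then deletes the file once the page contents are joined. A rough sketch of the same idea using the standard tempfile module is shown below; this is a hypothetical alternative for illustration, not the commit's code, and read_pdf_upload / uploaded_file are made-up names standing in for Streamlit's UploadedFile:

import os
import tempfile

from langchain_community.document_loaders import PyPDFLoader

def read_pdf_upload(uploaded_file):
    # Write the upload to a uniquely named temporary file so concurrent
    # sessions don't clobber each other's tmp_pdf.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    try:
        pages = PyPDFLoader(tmp_path).load()
        return "".join(page.page_content for page in pages)
    finally:
        os.remove(tmp_path)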
@@ -68,14 +85,16 @@ def chunk_text(text):
     "Only use bullet points."
     "Dont ever go beyond 10 bullet points."
     )
-
-if file != None:
-    text = file.read().decode()
+
+if file != None:
+
+    text = read_file(file)
     chunks = chunk_text(text)
     num_chunks = len(chunks)
     st.write(f"Processing data in {num_chunks} chunks...")
     progbar = st.progress(0.01, text="")
+    existing_answer = ""
 
     for i, chunk in enumerate(chunks):
         progbar.progress((i+1)/(num_chunks), text="")
         if i+1 < num_chunks:
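
The loop body below this point is collapsed in the diff, so the exact refinement step is not shown here. Based only on the variables visible above (existing_answer carried across chunks, a progress bar advanced per chunk), a refine-style pass over the chunks might look roughly like the sketch below; the endpoint, prompt wording, and sample chunks are assumptions for illustration, not the app's actual code:

from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Assumed local model endpoint; the app reads its endpoint from model_service.
llm = ChatOpenAI(base_url="http://localhost:8001/v1", api_key="not required",
                 temperature=0.0, max_tokens=400)

# Hypothetical refine prompt; the app's real prompt is defined above this hunk.
refine_prompt = PromptTemplate.from_template(
    "Existing summary:\n{existing_answer}\n\n"
    "New text to fold in:\n{text}\n\n"
    "Update the summary. Only use bullet points."
)

chunks = ["first chunk of the document...", "second chunk..."]  # stand-in for chunk_text(text)
existing_answer = ""
for chunk in chunks:
    existing_answer = llm.invoke(
        refine_prompt.format(existing_answer=existing_answer, text=chunk)
    ).content
print(existing_answer)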
