Skip to content

Commit

Permalink
Create embeddings with OpenAI (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
Maclenn77 authored Dec 9, 2023
1 parent 39103b5 commit 9dddd1e
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 13 deletions.
66 changes: 55 additions & 11 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,36 @@
""" A simple example of Streamlit. """
from datetime import datetime as Date
import textwrap
import os
import tiktoken
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import fitz
import streamlit as st
import openai
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a .env file (if any) before reading the environment.
load_dotenv()

# Without an API key no embedding can be created: show an error in the page
# and ask Streamlit to stop this script run.
if os.getenv("OPENAI_API_KEY") is None:
    st.error("Please set OPENAI_API_KEY environment variable")
    st.stop()
else:
    openai.api_key = os.getenv("OPENAI_API_KEY")

# OpenAI client used below to request embeddings explicitly, chunk by chunk.
client = OpenAI()

# Embedding function attached to the Chroma collection, used whenever
# documents are added (or queried) without precomputed embeddings.
embedding_function = OpenAIEmbeddingFunction(
    api_key=openai.api_key, model_name="text-embedding-ada-002"
)

# On-disk Chroma store, relative to the working directory.
chroma_client = chromadb.PersistentClient(path="tmp/chroma")
# The result is discarded; presumably a connectivity check -- TODO confirm.
chroma_client.heartbeat()

# Single collection holding every uploaded PDF. It is created exactly once,
# with the OpenAI embedding function attached.
collection = chroma_client.get_or_create_collection(
    name="pdf-explainer", embedding_function=embedding_function
)

# Query ChromaDb
query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
Expand All @@ -25,29 +46,52 @@
+ "..."
+ "**Source:** "
+ results["metadatas"][0][idx]["source"]
+ " **Tokens:** "
+ str(results["metadatas"][0][idx]["num_tokens"])
)


pdf = st.file_uploader("Upload a file", type="pdf")


if pdf is not None:
    # Extract the text of every page on each rerun; pages are joined with a
    # form feed (chr(12)). The document is only needed for the extraction.
    with fitz.open(stream=pdf.read(), filetype="pdf") as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])

    # Short preview of the extracted text.
    st.write(text[0:200])

    # Store the whole document as one entry. No embeddings are passed, so the
    # collection computes them itself.
    if st.button("Add to collection"):
        collection.add(
            documents=[text],
            metadatas=[{"source": pdf.name}],
            # Timestamp suffix keeps ids distinct across repeated uploads.
            ids=[pdf.name + str(Date.now())],
        )

    # Store the document as chunks of at most 24000 characters, each with an
    # explicitly requested embedding and its token count as metadata.
    if st.button("Save chunks"):
        with st.spinner("Saving chunks..."):
            # Loop-invariant: look the tokenizer up once, not once per chunk.
            encoding = tiktoken.get_encoding("cl100k_base")
            # textwrap.wrap breaks on whitespace, so words are never split.
            chunks = textwrap.wrap(text, 24000)
            for idx, chunk in enumerate(chunks):
                num_tokens = len(encoding.encode(chunk))
                response = (
                    client.embeddings.create(
                        input=chunk, model="text-embedding-ada-002"
                    )
                    .data[0]
                    .embedding
                )
                collection.add(
                    embeddings=[response],
                    documents=[chunk],
                    metadatas=[{"source": pdf.name, "num_tokens": num_tokens}],
                    # NOTE(review): ids depend only on file name and chunk
                    # index, so saving the same file again reuses them --
                    # confirm this is intended.
                    ids=[pdf.name + str(idx)],
                )
else:
    st.write("Please upload a file of type: pdf")


# Debug helper: render the collection object in the page.
if st.button("Chroma data collection"):
    st.write(collection)

# Drop the whole collection from the persistent store. The call is made
# exactly once, inside the guard.
if st.button("Delete Chroma Collection"):
    try:
        chroma_client.delete_collection(collection.name)
    except AttributeError:
        # NOTE(review): this is shown as an error although the text reads like
        # a success message -- confirm which failure it is meant to report.
        st.error("Collection erased.")

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ tiktoken
langchain
pymupdf
pypdf
chromadb
chromadb>=0.4.18
sentence_transformers
streamlit
5 changes: 4 additions & 1 deletion wk_flow_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
streamlit
pymupdf
openai
tiktoken
pylint
chromadb
langchain
chromadb>=0.4.18

0 comments on commit 9dddd1e

Please sign in to comment.