From ca67687ce2645d052e63e1bbdb8fa1cddbcb4e1b Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 20:37:56 -0600 Subject: [PATCH] Create embeddings with OpenAI --- app.py | 65 +++++++++++++++++++++++++++++++++------- requirements.txt | 2 +- wk_flow_requirements.txt | 2 +- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/app.py b/app.py index 0e8a5c9..cce7025 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,36 @@ """ A simple example of Streamlit. """ from datetime import datetime as Date +import textwrap +import tiktoken import chromadb +from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction import fitz import streamlit as st +import openai +import os +from dotenv import load_dotenv +from openai import OpenAI +load_dotenv() + +if os.getenv("OPENAI_API_KEY") is None: + st.error("Please set OPENAI_API_KEY environment variable") + st.stop() +else: + openai.api_key = os.getenv("OPENAI_API_KEY") + +client = OpenAI() +embedding_function = OpenAIEmbeddingFunction( + api_key=openai.api_key, model_name="text-embedding-ada-002" +) # from openai import OpenAI chroma_client = chromadb.PersistentClient(path="tmp/chroma") chroma_client.heartbeat() -collection = chroma_client.get_or_create_collection("pdf-explainer") +collection = chroma_client.get_or_create_collection( + name="pdf-explainer", embedding_function=embedding_function +) # Query ChromaDb query = st.text_input("Query ChromaDb", value="", placeholder="Enter query") @@ -25,29 +46,51 @@ + "..." 
+ "**Source:** " + results["metadatas"][0][idx]["source"] + + " **Tokens:** " + + str(results["metadatas"][0][idx]["num_tokens"]) ) pdf = st.file_uploader("Upload a file", type="pdf") - -if st.button("Save"): - if pdf is not None: - with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document - text = chr(12).join([page.get_text() for page in doc]) - st.write(text[0:200]) +if pdf is not None: + with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document + text = chr(12).join([page.get_text() for page in doc]) + st.write(text[0:200]) + if st.button("Add to collection"): collection.add( documents=[text], metadatas=[{"source": pdf.name}], ids=[pdf.name + str(Date.now())], ) - else: - st.write("Please upload a file of type: pdf") + if st.button("Save chunks"): + with st.spinner("Saving chunks..."): + chunks = textwrap.wrap(text, 24000) + for idx, chunk in enumerate(chunks): + encoding = tiktoken.get_encoding("cl100k_base") + num_tokens = len(encoding.encode(chunk)) + response = ( + client.embeddings.create( + input=chunk, model="text-embedding-ada-002" + ) + .data[0] + .embedding + ) + collection.add( + embeddings=[response], + documents=[chunk], + metadatas=[{"source": pdf.name, "num_tokens": num_tokens}], + ids=[pdf.name + str(idx)], + ) +else: + st.write("Please upload a file of type: pdf") if st.button("Chroma data collection"): st.write(collection) if st.button("Delete Chroma Collection"): - chroma_client.delete_collection(collection.name) - st.write("Deleted Chroma Collection") + try: + chroma_client.delete_collection(collection.name) + except AttributeError: + st.error("Collection erased.") diff --git a/requirements.txt b/requirements.txt index bcbff57..aae53b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ tiktoken langchain pymupdf pypdf -chromadb +chromadb>=0.4.18 sentence_transformers streamlit \ No newline at end of file diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index 2389c28..be29b87 
100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -1,4 +1,4 @@ streamlit pymupdf pylint -chromadb \ No newline at end of file +chromadb>=0.4.18