-
Notifications
You must be signed in to change notification settings - Fork 6
/
utils.py
116 lines (99 loc) · 4.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import sys
import datetime as dt
from loguru import logger
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
# local imports
import settings
def create_vectordb_name(content_folder_name, chunk_size=None, chunk_overlap=None):
    """
    Build the content folder path and the matching vector database folder path.

    The vector database folder is named after the content folder, followed by a
    suffix made of the vector store type, chunk size, chunk overlap, embeddings
    provider and embeddings model. When chunk_size is falsy, both chunk size
    and chunk overlap are taken from settings; otherwise the given chunk_size
    and chunk_overlap are used as passed.

    Returns a tuple (content_folder_path, vectordb_folder_path).
    """
    content_folder_path = os.path.join(settings.DOC_DIR, content_folder_name)
    # explicit chunk arguments are only honoured when a truthy chunk_size is given
    if chunk_size:
        size, overlap = chunk_size, chunk_overlap
    else:
        size, overlap = settings.CHUNK_SIZE, settings.CHUNK_OVERLAP
    # the empty first element produces the leading "_" of the suffix
    suffix = "_".join([
        "",
        settings.VECDB_TYPE,
        str(size),
        str(overlap),
        settings.EMBEDDINGS_PROVIDER,
        settings.EMBEDDINGS_MODEL,
    ])
    vectordb_folder_path = os.path.join(settings.VECDB_DIR, content_folder_name) + suffix
    return content_folder_path, vectordb_folder_path
def exit_program():
    """Print a farewell message and terminate the program with exit status 0."""
    print("Exiting the program...")
    # sys.exit(0) does exactly this: it raises SystemExit with code 0
    raise SystemExit(0)
def getattr_or_default(obj, attr, default=None):
    """
    Look up attribute attr on obj.

    Returns default when the attribute does not exist or when its value is
    None; any other value (including falsy ones such as 0 or "") is returned
    unchanged.
    """
    found = getattr(obj, attr, None)
    if found is None:
        # covers both "attribute missing" and "attribute explicitly None"
        return default
    return found
def get_chroma_vector_store(collection_name, embeddings, vectordb_folder):
    """
    Create a Chroma vector store for the given collection.

    collection_name is the name of the collection, embeddings is passed as the
    embedding function and vectordb_folder as the persist directory. The
    collection metadata sets "hnsw:space" to "cosine".

    Returns the Chroma instance.
    """
    chroma_kwargs = {
        "collection_name": collection_name,
        "embedding_function": embeddings,
        "persist_directory": vectordb_folder,
        "collection_metadata": {"hnsw:space": "cosine"},
    }
    return Chroma(**chroma_kwargs)
def get_settings_as_dictionary(file_name):
    """
    Read simple ``NAME = value`` assignments from a settings file into a dictionary.

    Only lines at or below the first line starting with "# #########" are
    considered. Lines starting with "#" are skipped, as is every line that does
    not contain exactly one "=".

    Each value is evaluated as a Python expression; it may refer to names
    assigned earlier in the same file and to this module's globals.

    NOTE: the file content is run through exec()/eval(). Only use this on
    trusted, project-owned settings files, never on external input.

    Returns a dictionary mapping each variable name to its evaluated value.
    """
    # Initialize an empty dictionary to store the variables and their values
    variables_dict = {}
    # Dedicated namespace for the executed assignments. With bare exec()/eval()
    # the function's own locals are used instead: a setting named like a local
    # variable (e.g. "parts" or "line") then reads back the wrong value, and on
    # Python 3.13+ every lookup fails with NameError because each locals() call
    # is an independent snapshot (PEP 667).
    namespace = {}
    start_reading = False
    # Open the file and process it line by line
    with open(file=file_name, mode='r', encoding="utf-8") as file:
        for line in file:
            # start reading below the line with # #########
            if line.startswith("# #########"):
                start_reading = True
            # ignore everything above the marker and all comment lines
            if not start_reading or line.startswith("#"):
                continue
            # Remove leading and trailing whitespace and split the line by '='
            parts = line.strip().split('=')
            # Only lines that split into exactly two parts are assignments
            if len(parts) != 2:
                continue
            variable_name = parts[0].strip()
            variable_value = parts[1].strip()
            # Run the assignment inside the dedicated namespace ...
            exec(f'{variable_name} = {variable_value}', globals(), namespace)
            # ... and read the resulting value back from that same namespace
            variables_dict[variable_name] = eval(variable_name, globals(), namespace)
    return variables_dict
def getEmbeddings(embeddings_provider, embeddings_model, local_api_url, azureopenai_api_version):
    """
    Create the embeddings object for the given provider.

    Supported values for embeddings_provider are "openai", "huggingface",
    "local_embeddings" and "azureopenai". embeddings_model is the model name
    (for "azureopenai": the Azure deployment). local_api_url and
    azureopenai_api_version are only used for "azureopenai", as the Azure
    endpoint and the API version.

    Returns the embeddings instance.
    Raises ValueError when embeddings_provider is not one of the supported values.
    """
    # determine embeddings model
    if embeddings_provider == "openai":
        embeddings = OpenAIEmbeddings(model=embeddings_model, client=None)
        logger.info("Loaded openai embeddings")
    elif embeddings_provider == "huggingface":
        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
        logger.info("Loaded huggingface embeddings")
    elif embeddings_provider == "local_embeddings":
        # same HuggingFace class, pinned to CPU and without normalisation
        embeddings = HuggingFaceEmbeddings(
            model_name=embeddings_model,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': False},
        )
        logger.info("Loaded local embeddings: " + embeddings_model)
    elif embeddings_provider == "azureopenai":
        logger.info("Retrieve " + embeddings_model)
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=embeddings_model,
            openai_api_version=azureopenai_api_version,
            azure_endpoint=local_api_url,
        )
        logger.info("Loaded Azure OpenAI embeddings")
    else:
        # without this branch the return below fails with an opaque UnboundLocalError
        raise ValueError(
            f"Unsupported embeddings provider: {embeddings_provider!r}; expected one of "
            "'openai', 'huggingface', 'local_embeddings', 'azureopenai'"
        )
    return embeddings
def get_timestamp():
    """Return the current local date and time as a string, e.g. "2024-01-31 12:34:56.789012"."""
    now = dt.datetime.now()
    # isoformat with a space separator is what str() produces for a datetime
    return now.isoformat(sep=" ")
def get_content_folder_name() -> str:
    """
    Ask the user to select one of the folders in settings.DOC_DIR.

    The sub-folders of DOC_DIR are listed with 1-based numbers and the user is
    prompted until a valid number is entered.

    Returns the name (not the full path) of the selected folder.
    Raises FileNotFoundError when DOC_DIR contains no folders.
    """
    path = settings.DOC_DIR
    content_folder_names = [folder for folder in os.listdir(path) if os.path.isdir(os.path.join(path, folder))]
    if not content_folder_names:
        # nothing to choose from; prompting would never succeed
        raise FileNotFoundError(f"No folders found in {path}")
    print(f"Available folders in {path}:")
    for idx, folder in enumerate(content_folder_names, start=1):
        print(f"{idx}. {folder}")
    while True:
        answer = input("Select a folder by number: ")
        try:
            number = int(answer)
        except ValueError:
            print(f"'{answer}' is not a number, please try again.")
            continue
        # reject 0 and negative numbers explicitly: as list indices they would
        # silently select a folder counted from the end of the list
        if 1 <= number <= len(content_folder_names):
            return content_folder_names[number - 1]
        print(f"Please enter a number between 1 and {len(content_folder_names)}.")