From 8237aaa90489f576c60a4f2a8709173095b57e7b Mon Sep 17 00:00:00 2001
From: Sayak Chakrabarty
Date: Mon, 15 Apr 2024 06:02:00 +0000
Subject: [PATCH] updates

---
Review notes (this section is not part of the commit message):
 * The line structure of this patch was restored from a copy whose
   whitespace had been collapsed. Indentation of removed lines and the
   position of blank context lines are reconstructed; use
   `git apply --ignore-whitespace` if a hunk is rejected.
 * HNSWLib.py: store `embeddings`/`args` on the instance, stop forwarding
   arguments to VectorStore.__init__, implement the abstract VectorStore
   methods, use the real InMemoryDocstore API (dict based `add`, string
   keys), read the first row of `knn_query` results, avoid
   `get_current_count()` on an uninitialized index, and serialize documents
   as plain JSON fields in save()/load().
 * index.py: import the HNSWLib class, not the module, so that
   `HNSWLib.load(...)` resolves.
 * Blob ids of the changed post-images are omitted because the content
   changed during review.

 doc_generator/Utils/HNSW.py    |  89 -------------
 doc_generator/Utils/HNSWLib.py | 225 +++++++++++++++++++++++++++++++++
 doc_generator/Utils/index.py   |   4 +-
 3 files changed, 227 insertions(+), 91 deletions(-)
 delete mode 100644 doc_generator/Utils/HNSW.py
 create mode 100644 doc_generator/Utils/HNSWLib.py

diff --git a/doc_generator/Utils/HNSW.py b/doc_generator/Utils/HNSW.py
deleted file mode 100644
index 3a63bec..0000000
--- a/doc_generator/Utils/HNSW.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import json
-import os
-from pathlib import Path
-from abc import abstractmethod
-import hnswlib
-from langchain_community.docstore.in_memory import InMemoryDocstore
-from langchain_community.embeddings import OpenAIEmbeddings
-from langchain_core.vectorstores import VectorStore
-
-
-class SaveableVectorStore(VectorStore):
-
-    @abstractmethod
-    def save(self, directory):
-        pass
-
-
-class HNSW(SaveableVectorStore):
-
-    def __init__(self,
-                 embeddings,
-                 space,
-                 num_dimensions=None,
-                 docstore=None,
-                 index=None):
-        super().__init__(embeddings)
-        self.space = space
-        self.num_dimensions = num_dimensions or embeddings.get_dimensions()
-        self.docstore = docstore or InMemoryDocstore()
-        self.index = index or self.init_index()
-
-    def init_index(self):
-        index = hnswlib.Index(space=self.space, dim=self.num_dimensions)
-        index.init_index(max_elements=10000, ef_construction=200, M=16)
-        return index
-
-    def add_documents(self, documents):
-        texts = [doc.page_content for doc in documents]
-        vectors = self.embeddings.embed_documents(texts)
-        self.add_vectors(vectors, documents)
-
-    def add_vectors(self, vectors, documents):
-        if not self.index:
-            self.init_index()
-        if len(vectors) != len(documents):
-            raise ValueError("Vectors and documents must have the same length")
-        self.index.add_items(vectors, ids=[i for i in range(len(documents))])
-        for i, doc in enumerate(documents):
-            self.docstore.add(doc, i)
-
-    def similarity_search_vector_with_score(self, query, k):
-        labels, distances = self.index.knn_query(query, k=k)
-        return [(self.docstore.get(label[0]), distance[0])
-                for label, distance in zip(labels, distances)]
-
-    def save(self, directory):
-        os.makedirs(directory, exist_ok=True)
-        self.index.save_index(str(Path(directory) / "hnswlib.index"))
-        with open(Path(directory) / "args.json", 'w') as f:
-            json.dump({
-                'space': self.space,
-                'num_dimensions': self.num_dimensions
-            }, f)
-        with open(Path(directory) / "docstore.json", 'w') as f:
-            json.dump(self.docstore.dump(), f)
-
-    @staticmethod
-    def load(directory, embeddings):
-        with open(Path(directory) / "args.json", 'r') as f:
-            args = json.load(f)
-        with open(Path(directory) / "docstore.json", 'r') as f:
-            documents = json.load(f)
-        docstore = InMemoryDocstore.load(documents)
-        index = hnswlib.Index(space=args['space'], dim=args['num_dimensions'])
-        index.load_index(str(Path(directory) / "hnswlib.index"),
-                         max_elements=10000)
-        return HNSW(embeddings, args['space'], args['num_dimensions'], docstore,
-                    index)
-
-
-# Example usage
-if __name__ == "__main__":
-    embeddings = Embeddings()  # Placeholder for actual embeddings implementation
-    hnsw = HNSW(embeddings, 'cosine')
-    # Assume Document class and documents initialization here
-    documents = []  # This should be a list of Document instances
-    hnsw.add_documents(documents)
-    query_vector = [0.1] * hnsw.num_dimensions  # Example query vector
-    print(hnsw.similarity_search_vector_with_score(query_vector, 5))
diff --git a/doc_generator/Utils/HNSWLib.py b/doc_generator/Utils/HNSWLib.py
new file mode 100644
--- /dev/null
+++ b/doc_generator/Utils/HNSWLib.py
@@ -0,0 +1,225 @@
+"""HNSWLib-backed LangChain vector store that can be saved to disk."""
+import json
+import os
+from abc import abstractmethod
+from typing import Any, Iterable, List, Optional, Tuple
+
+import hnswlib
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.embeddings import OpenAIEmbeddings
+from langchain_core.documents import Document
+from langchain_core.vectorstores import VectorStore
+
+
+class SaveableVectorStore(VectorStore):
+    """Vector store that can persist itself to a directory."""
+
+    @abstractmethod
+    def save(self, directory):
+        """Write the store's state into ``directory``."""
+
+
+class HNSWLibBase:
+    """Parameters that describe the geometry of an HNSW index."""
+
+    def __init__(self, space: str, num_dimensions: Optional[int] = None):
+        self.space = space
+        self.num_dimensions = num_dimensions
+
+
+class HNSWLibArgs(HNSWLibBase):
+    """Index parameters plus an optional pre-built docstore and index."""
+
+    def __init__(self,
+                 space: str,
+                 num_dimensions: Optional[int] = None,
+                 docstore: Optional[InMemoryDocstore] = None,
+                 index: Optional[hnswlib.Index] = None):
+        super().__init__(space, num_dimensions)
+        self.docstore = docstore
+        self.index = index
+
+
+class HNSWLib(SaveableVectorStore):
+    """LangChain vector store backed by an in-process ``hnswlib`` index.
+
+    Every vector is stored under an integer label and the matching document
+    is kept in the docstore under ``str(label)``.
+    """
+
+    def __init__(self, embeddings: OpenAIEmbeddings, args: 'HNSWLibArgs'):
+        # VectorStore defines no __init__, so no arguments may be forwarded.
+        super().__init__()
+        self._embeddings = embeddings
+        self.args = args
+        self._index = args.index
+        self.docstore = (args.docstore
+                         if args.docstore is not None else InMemoryDocstore())
+
+    @property
+    def embeddings(self) -> OpenAIEmbeddings:
+        """Embedding model used for both documents and queries."""
+        return self._embeddings
+
+    @property
+    def index(self) -> hnswlib.Index:
+        """The underlying index; raises until one has been created."""
+        if self._index is None:
+            raise RuntimeError('Vector store not initialized yet. '
+                               'Try calling `add_documents` first.')
+        return self._index
+
+    @index.setter
+    def index(self, value: hnswlib.Index):
+        self._index = value
+
+    @staticmethod
+    def get_hierarchical_nsw(args: 'HNSWLibArgs'):
+        """Create an (uninitialized) index for ``args``.
+
+        Raises:
+            ValueError: if ``space`` or ``num_dimensions`` is missing.
+        """
+        if args.space is None:
+            raise ValueError('hnswlib requires a space argument')
+        if args.num_dimensions is None:
+            raise ValueError('hnswlib requires a num_dimensions argument')
+        return hnswlib.Index(space=args.space, dim=args.num_dimensions)
+
+    def init_index(self, vectors: List[List[float]]):
+        """Create and size the index on first use; later calls are no-ops."""
+        if self._index is None:
+            if self.args.num_dimensions is None:
+                # Infer the dimensionality from the first vector.
+                self.args.num_dimensions = len(vectors[0])
+            self._index = HNSWLib.get_hierarchical_nsw(self.args)
+        # `max_elements` reads 0 until init_index()/load_index() has run.
+        if not self._index.max_elements:
+            self._index.init_index(max_elements=len(vectors))
+
+    def add_texts(self,
+                  texts: Iterable[str],
+                  metadatas: Optional[List[dict]] = None,
+                  **kwargs: Any) -> List[str]:
+        """Embed ``texts`` and add them; returns the new docstore ids."""
+        texts = list(texts)
+        metadatas = metadatas or [{} for _ in texts]
+        documents = [
+            Document(page_content=text, metadata=metadata)
+            for text, metadata in zip(texts, metadatas)
+        ]
+        return self.add_documents(documents)
+
+    def add_documents(self, documents: List, **kwargs: Any) -> List[str]:
+        """Embed ``documents`` and add them; returns the new docstore ids."""
+        if not documents:
+            return []
+        texts = [doc.page_content for doc in documents]
+        vectors = self.embeddings.embed_documents(texts)
+        return self.add_vectors(vectors, documents)
+
+    def add_vectors(self, vectors: List[List[float]],
+                    documents: List) -> List[str]:
+        """Add pre-computed ``vectors`` with their ``documents``.
+
+        Raises:
+            ValueError: if the lengths differ or a vector has the wrong
+                number of dimensions.
+        """
+        if not vectors:
+            return []
+        if len(vectors) != len(documents):
+            raise ValueError("Vectors and documents must have the same length")
+        self.init_index(vectors)
+        if len(vectors[0]) != self.args.num_dimensions:
+            raise ValueError(
+                "Vectors must have the same length as the number of "
+                f"dimensions ({self.args.num_dimensions})")
+        index = self.index
+        # Fix the first label once: the counts grow while items are added.
+        first_label = index.element_count
+        needed = first_label + len(vectors)
+        if needed > index.max_elements:
+            index.resize_index(needed)
+        labels = list(range(first_label, needed))
+        index.add_items(vectors, labels)
+        ids = [str(label) for label in labels]
+        self.docstore.add(dict(zip(ids, documents)))
+        return ids
+
+    def similarity_search_vector_with_score(self, query: List[float],
+                                            k: int) -> List[Tuple[Any, float]]:
+        """Return up to ``k`` ``(document, distance)`` pairs for ``query``."""
+        if len(query) != self.args.num_dimensions:
+            raise ValueError(
+                "Query vector must have the same length as the number of "
+                f"dimensions ({self.args.num_dimensions})")
+        total = self.index.element_count
+        if k > total:
+            print(f"k ({k}) is greater than the number of elements in the "
+                  f"index ({total}), setting k to {total}")
+            k = total
+        if k <= 0:
+            return []
+        labels, distances = self.index.knn_query(query, k=k)
+        # knn_query returns one row per query vector; only one was sent.
+        return [(self.docstore.search(str(int(label))), float(distance))
+                for label, distance in zip(labels[0], distances[0])]
+
+    def similarity_search_with_score(self, query: str, k: int = 4,
+                                     **kwargs: Any) -> List[Tuple[Any, float]]:
+        """Embed ``query`` and return ``(document, distance)`` pairs."""
+        vector = self.embeddings.embed_query(query)
+        return self.similarity_search_vector_with_score(vector, k)
+
+    def similarity_search(self, query: str, k: int = 4,
+                          **kwargs: Any) -> List[Document]:
+        """Return the ``k`` documents closest to ``query``."""
+        return [doc for doc, _ in self.similarity_search_with_score(query, k)]
+
+    @classmethod
+    def from_texts(cls,
+                   texts: List[str],
+                   embedding: OpenAIEmbeddings,
+                   metadatas: Optional[List[dict]] = None,
+                   **kwargs: Any) -> 'HNSWLib':
+        """Build a store from raw texts (``space`` defaults to cosine)."""
+        args = HNSWLibArgs(space=kwargs.get('space', 'cosine'),
+                           num_dimensions=kwargs.get('num_dimensions'))
+        store = cls(embedding, args)
+        store.add_texts(texts, metadatas)
+        return store
+
+    def save(self, directory: str):
+        """Write index, documents and index parameters to ``directory``."""
+        os.makedirs(directory, exist_ok=True)
+        self.index.save_index(os.path.join(directory, 'hnswlib.index'))
+        # Documents are not JSON serializable; store their plain fields.
+        docs = {
+            key: {'page_content': doc.page_content, 'metadata': doc.metadata}
+            for key, doc in self.docstore._dict.items()
+        }
+        with open(os.path.join(directory, 'docstore.json'), 'w') as f:
+            json.dump(docs, f)
+        with open(os.path.join(directory, 'args.json'), 'w') as f:
+            json.dump({'space': self.args.space,
+                       'num_dimensions': self.args.num_dimensions}, f)
+
+    @staticmethod
+    def load(directory: str, embeddings: OpenAIEmbeddings):
+        """Rebuild a store previously written by :meth:`save`."""
+        with open(os.path.join(directory, 'args.json'), 'r') as f:
+            args_data = json.load(f)
+        args = HNSWLibArgs(space=args_data['space'],
+                           num_dimensions=args_data['num_dimensions'])
+        index = hnswlib.Index(space=args.space, dim=args.num_dimensions)
+        index.load_index(os.path.join(directory, 'hnswlib.index'))
+        with open(os.path.join(directory, 'docstore.json'), 'r') as f:
+            doc_data = json.load(f)
+        args.docstore = InMemoryDocstore({
+            key: Document(page_content=doc['page_content'],
+                          metadata=doc.get('metadata') or {})
+            for key, doc in doc_data.items()
+        })
+        args.index = index
+        return HNSWLib(embeddings, args)
diff --git a/doc_generator/Utils/index.py b/doc_generator/Utils/index.py
--- a/doc_generator/Utils/index.py
+++ b/doc_generator/Utils/index.py
@@ -2,7 +2,7 @@ from prompt_toolkit.shortcuts import clear
 import os
 
 from langchain_community.embeddings import OpenAIEmbeddings
-import HNSW
+from HNSWLib import HNSWLib
 from markdown2 import markdown
 from createChatChain import make_chain
 
@@ -15,7 +15,7 @@ def display_welcome_message(project_name):
 def query(name, repository_url, output, content_type, chat_prompt, target_audience, llms):
     data_path = os.path.join(output, 'docs', 'data')
     embeddings = OpenAIEmbeddings()
-    vector_store = HNSW.load(data_path, embeddings)
+    vector_store = HNSWLib.load(data_path, embeddings)
     chain = make_chain(name, repository_url, content_type, chat_prompt, target_audience, vector_store, llms)
     clear()
 