forked from saxenaakansha30/documentor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chunk_vector_store.py
22 lines (19 loc) · 915 Bytes
/
chunk_vector_store.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import pgvector
from langchain_community.embeddings import fastembed
from config import Config
class ChunkVectorStore:
    """Split a PDF into text chunks and persist those chunks in PGVector."""

    # Splitter settings used when the caller does not override them; these are
    # the values that were previously hard-coded in split_into_chunks.
    DEFAULT_CHUNK_SIZE = 1024
    DEFAULT_CHUNK_OVERLAP = 20

    def split_into_chunks(
        self,
        file_path: str,
        *,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ) -> list:
        """Load the PDF at *file_path* and return it as a list of chunks.

        The file is read with ``PyPDFLoader``, split with
        ``RecursiveCharacterTextSplitter`` and finally passed through
        ``filter_complex_metadata``.

        Args:
            file_path: Path handed to ``PyPDFLoader``.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

        Returns:
            Whatever ``filter_complex_metadata`` returns for the split
            documents (presumably a list of LangChain ``Document`` objects
            -- confirm against the installed LangChain version).
        """
        pages = PyPDFLoader(file_path).load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        # NOTE(review): per LangChain's docs filter_complex_metadata drops
        # metadata values that are not simple scalars; confirm this is still
        # required by the PGVector backend in use.
        return filter_complex_metadata(splitter.split_documents(pages))

    def store_to_vector_database(self, chunks) -> "pgvector.PGVector":
        """Embed *chunks* and store them through ``PGVector.from_documents``.

        A fresh ``FastEmbedEmbeddings`` instance is created on every call and
        the connection string is read from
        ``Config.get_db_connection_string()``.

        Args:
            chunks: Documents to store, e.g. the result of
                ``split_into_chunks``.

        Returns:
            The object returned by ``PGVector.from_documents``.
        """
        return pgvector.PGVector.from_documents(
            documents=chunks,
            embedding=fastembed.FastEmbedEmbeddings(),
            connection_string=Config.get_db_connection_string(),
        )