diff --git a/projects/Basic-RAG/BasciRAG_CustomPrompt.py b/projects/Basic-RAG/BasciRAG_CustomPrompt.py
deleted file mode 100644
index 4222aad..0000000
--- a/projects/Basic-RAG/BasciRAG_CustomPrompt.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import Prompt
-
-custom_prompt = Prompt(
-    input_keys={"context", "question"},
-    template='''Answer the following question based on the given context.
-    question: {question}
-    context: {context}
-    answer:
-    '''
-)
-rag = BasicRAG(doc_chain="stuff",
-               custom_prompt=custom_prompt)
diff --git a/projects/Basic-RAG/BasicRAG-ingest_data.py b/projects/Basic-RAG/BasicRAG-ingest_data.py
deleted file mode 100644
index 5d2417b..0000000
--- a/projects/Basic-RAG/BasicRAG-ingest_data.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-from pathlib import Path
-from uuid import UUID, uuid5
-
-from grag.components.multivec_retriever import Retriever
-from grag.components.parse_pdf import ParsePDF
-from grag.components.utils import get_config
-from tqdm import tqdm
-
-config = get_config()
-
-DRY_RUN = False
-
-data_path = Path(config['data']['data_path']) / 'pdf'
-formats_to_add = ['Text', 'Tables']
-glob_pattern = '**/*.pdf'
-
-namespace = UUID('8c9040b0-b5cd-4d7c-bc2e-737da1b24ebf')
-record_filename = uuid5(namespace, str(data_path))  # Unique file record name based on folder path
-
-records_dir = Path(config['data']['data_path']) / 'records'
-records_dir.mkdir(parents=True, exist_ok=True)
-record_file = records_dir / f'{record_filename}.txt'
-
-
-def load_processed_files():
-    # Load processed files from a file if it exists
-    if os.path.exists(record_file):
-        with open(record_file, 'r') as file_record:
-            processed_files.update(file_record.read().splitlines())
-
-
-def update_processed_file_record(file_path, dry_run=False):
-    # Update (append) the processed file record file
-    with open(record_file, 'a') as file:
-        if not dry_run:
-            file.write(file_path + '\n')
-
-
-def add_file_to_database(file_path: Path, dry_run=False):
-    # Check if file_path is in the processed file set
-    if str(file_path) not in processed_files:
-        # Add file to the vector database
-        add_to_database(file_path, dry_run=dry_run)
-        # Add file_path to the processed file set
-        processed_files.add(str(file_path))
-        # Update the processed file record file
-        update_processed_file_record(str(file_path), dry_run=dry_run)
-        return f'Completed adding - {file_path.relative_to(data_path)}'
-    else:
-        return f'Already exists - {file_path.relative_to(data_path)}'
-
-
-def add_to_database(file_path, dry_run=False):
-    if not dry_run:
-        docs = parser.load_file(file_path)
-        for format_key in formats_to_add:
-            retriever.add_docs(docs[format_key])
-
-
-parser = ParsePDF()
-retriever = Retriever()
-
-processed_files = set()
-load_processed_files()  # Load processed files into the set on script startup
-
-
-def main():
-    filepath_gen = data_path.glob(glob_pattern)
-    num_files = len(list(data_path.glob(glob_pattern)))
-    print(f'DATA PATH : {data_path}')
-    print(f'No of PDFs to add: {num_files}')
-    pbar = tqdm(filepath_gen, total=num_files, desc='Adding Files ')
-    for file in pbar:
-        pbar.set_postfix({'Current file': file.relative_to(data_path)})
-        pbar.write(add_file_to_database(file, dry_run=DRY_RUN))
-        # if str(file) not in processed_files:
-        #     add_to_database(file, dry_run=DRY_RUN)  # Add file to the vector database
-        #     processed_files.add(str(file))  # Add file_path to processed set
-        #     update_processed_file_record(str(file), dry_run=DRY_RUN)  # Update the processed file record file
-        #     pbar.write(f'Completed adding - {file.relative_to(data_path)}')
-        # else:
-        #     pbar.write(f'Already exists - {file.relative_to(data_path)}')
-
-
-if __name__ == "__main__":
-    main()
diff --git a/projects/Basic-RAG/BasicRAG_CustomPrompt.py b/projects/Basic-RAG/BasicRAG_CustomPrompt.py
new file mode 100644
index 0000000..4bf387b
--- /dev/null
+++ b/projects/Basic-RAG/BasicRAG_CustomPrompt.py
@@ -0,0 +1,14 @@
+"""A cookbook demonstrating how to use custom prompts with Basic RAG."""
+
+from grag.components.prompt import Prompt
+from grag.rag.basic_rag import BasicRAG
+
+custom_prompt = Prompt(
+    input_keys={"context", "question"},
+    template="""Answer the following question based on the given context.
+    question: {question}
+    context: {context}
+    answer:
+    """,
+)
+rag = BasicRAG(doc_chain="stuff", custom_prompt=custom_prompt)
diff --git a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
index cd9199c..a2804e6 100644
--- a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
+++ b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
@@ -1,29 +1,30 @@
+"""A cookbook demonstrating how to use custom few-shot prompts with Basic RAG."""
+
+from grag.components.prompt import FewShotPrompt
 from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import FewShotPrompt
 
 custom_few_shot_prompt = FewShotPrompt(
     input_keys={"context", "question"},
     output_keys={"answer"},
-    example_template='''
+    example_template="""
     question: {question}
    answer: {answer}
-    ''',
-    prefix='''Answer the following question based on the given context like examples given below:''',
-    suffix='''Answer the following question based on the given context
+    """,
+    prefix="""Answer the following question based on the given context like examples given below:""",
+    suffix="""Answer the following question based on the given context
     question: {question}
     context: {context}
     answer:
-    ''',
+    """,
     examples=[
         {
             "question": "What is the name of largest planet?",
-            "answer": "Jupiter is the largest planet."
+            "answer": "Jupiter is the largest planet.",
         },
         {
             "question": "Who came up with Convolutional Neural Networks?",
-            "answer": "Yann LeCun introduced convolutional neural networks."
-        }
-    ]
+            "answer": "Yann LeCun introduced convolutional neural networks.",
+        },
+    ],
 )
-rag = BasicRAG(doc_chain="stuff",
-               custom_prompt=custom_few_shot_prompt)
+rag = BasicRAG(doc_chain="stuff", custom_prompt=custom_few_shot_prompt)
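Neither cookbook above actually issues a query; both stop after constructing `rag`. A minimal usage sketch, under the assumption that a `BasicRAG` instance is invoked the way the removed test scripts under `projects/Basic-RAG/tests/` call it, i.e. `response, sources = rag(query)`:

```python
from grag.components.prompt import Prompt
from grag.rag.basic_rag import BasicRAG

custom_prompt = Prompt(
    input_keys={"context", "question"},
    template="""Answer the following question based on the given context.
    question: {question}
    context: {context}
    answer:
    """,
)
rag = BasicRAG(doc_chain="stuff", custom_prompt=custom_prompt)

if __name__ == "__main__":
    # Call pattern borrowed from the deleted tests/BasicRAG_v1_test.py; the
    # (response, sources) return shape is an assumption, not a documented API.
    response, sources = rag("What is Retrieval-Augmented Generation?")
    print(response)
    for index, source in enumerate(sources):
        print(f"\t{index}: {source}")
```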
diff --git a/projects/Basic-RAG/BasicRAG_ingest.py b/projects/Basic-RAG/BasicRAG_ingest.py
new file mode 100644
index 0000000..e7b38a0
--- /dev/null
+++ b/projects/Basic-RAG/BasicRAG_ingest.py
@@ -0,0 +1,13 @@
+"""A cookbook demonstrating how to ingest pdf files for use with Basic RAG."""
+
+from pathlib import Path
+
+from grag.components.multivec_retriever import Retriever
+from grag.components.vectordb.deeplake_client import DeepLakeClient
+
+client = DeepLakeClient(collection_name="test")
+retriever = Retriever(vectordb=client)
+
+dir_path = Path(__file__).parents[2] / "data/client_test/test/"
+
+retriever.ingest(dir_path)
diff --git a/projects/Basic-RAG/BasicRAG_refine.py b/projects/Basic-RAG/BasicRAG_refine.py
index cce6398..1e83d41 100644
--- a/projects/Basic-RAG/BasicRAG_refine.py
+++ b/projects/Basic-RAG/BasicRAG_refine.py
@@ -1,5 +1,11 @@
-from grag.grag.rag import BasicRAG
+"""A cookbook demonstrating how to use Basic RAG with refine chain using DeepLake as client."""
 
+from grag.components.multivec_retriever import Retriever
+from grag.components.vectordb.deeplake_client import DeepLakeClient
+from grag.rag.basic_rag import BasicRAG
+
+client = DeepLakeClient(collection_name="test")
+retriever = Retriever(vectordb=client)
 rag = BasicRAG(doc_chain="refine")
 
 if __name__ == "__main__":
diff --git a/projects/Basic-RAG/BasicRAG_stuff.py b/projects/Basic-RAG/BasicRAG_stuff.py
index 63edeab..da95ec6 100644
--- a/projects/Basic-RAG/BasicRAG_stuff.py
+++ b/projects/Basic-RAG/BasicRAG_stuff.py
@@ -1,3 +1,5 @@
+"""A cookbook demonstrating how to use Basic RAG with stuff chain using DeepLake as client."""
+
 from grag.components.multivec_retriever import Retriever
 from grag.components.vectordb.deeplake_client import DeepLakeClient
 from grag.rag.basic_rag import BasicRAG
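The three cookbooks above each build a `DeepLakeClient`-backed `Retriever`, but the refine and stuff variants never show a question being asked against the ingested collection. A short end-to-end sketch, assuming `BasicRAG` accepts the `retriever` argument referenced in `src/grag/rag/basic_rag.py` further down and is callable as in the removed test scripts:

```python
from pathlib import Path

from grag.components.multivec_retriever import Retriever
from grag.components.vectordb.deeplake_client import DeepLakeClient
from grag.rag.basic_rag import BasicRAG

client = DeepLakeClient(collection_name="test")
retriever = Retriever(vectordb=client)

# Ingest a folder of PDFs into the collection, then query against it.
dir_path = Path(__file__).parents[2] / "data/client_test/test/"
retriever.ingest(dir_path)

# Passing retriever= is an assumption based on BasicRAG.__init__ falling back to
# Retriever() only when no retriever is supplied; the call pattern comes from the
# deleted tests.
rag = BasicRAG(doc_chain="refine", retriever=retriever)
response, sources = rag("What types of dependencies does dependence analysis identify in loop programs?")
```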
diff --git a/projects/Basic-RAG/BasicRAG_v1_depricated.py b/projects/Basic-RAG/BasicRAG_v1_depricated.py
deleted file mode 100644
index 63b5b89..0000000
--- a/projects/Basic-RAG/BasicRAG_v1_depricated.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import stuff_docs, load_prompt
-
-# from prompts import
-'''
-Basic RAG v1 - stuff, chunks
-    Given a query, retrieve similar chunks from vector database. Concat them into a single string, called context.
-    Using the prompt template, call llm. Return chunk sources.
-'''
-
-llm_ = LLM()
-# llm = llm_.load_model(pipeline='hf', is_local=False)
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompt_name = 'Llama-2_QA_1.json'
-prompt_path = Path(__file__).parent / 'prompts' / prompt_name
-prompt_template = load_prompt(prompt_path)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    context = stuff_docs(retrieved_docs)
-    prompt = prompt_template.format(context=context, question=query)
-    response = llm.invoke(prompt)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    return response, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        response, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(response)
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
diff --git a/projects/Basic-RAG/BasicRAG_v2_depricated.py b/projects/Basic-RAG/BasicRAG_v2_depricated.py
deleted file mode 100644
index fd2e250..0000000
--- a/projects/Basic-RAG/BasicRAG_v2_depricated.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import load_prompt
-
-
-'''
-Basic RAG v2 - refine, top_k
-
-'''
-
-llm_ = LLM()
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompts_path = Path(__file__).parent / 'prompts'
-prompt_name_question = 'Llama-2_QA_1.json'
-prompt_template_question = load_prompt(prompts_path / prompt_name_question)
-prompt_name_refine = 'Llama-2_QA-refine_1.json'
-prompt_template_refine = load_prompt(prompts_path / prompt_name_refine)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    responses = []
-    for index, doc in enumerate(retrieved_docs):
-        if index == 0:
-            prompt = prompt_template_question.format(context=doc.page_content,
-                                                     question=query)
-            response = llm.invoke(prompt)
-            responses.append(response)
-        else:
-            prompt = prompt_template_refine.format(context=doc.page_content,
-                                                   question=query,
-                                                   existing_answer=responses[-1])
-            response = llm.invoke(prompt)
-            responses.append(response)
-    return responses, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        responses, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(responses[-1])
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
diff --git a/projects/Basic-RAG/README.md b/projects/Basic-RAG/README.md
index 8e04a67..dd8f74c 100644
--- a/projects/Basic-RAG/README.md
+++ b/projects/Basic-RAG/README.md
@@ -1,11 +1,14 @@
-# Basic RAG
+# Basic RAG Cookbook
 
-## BasicRAG v1
+Welcome to the Basic RAG Cookbook! This repository is dedicated to showcasing how to utilize the Retrieval-Augmented
+Generation (RAG) model for various applications using custom and few-shot prompts. For in depth understanding of RAG
+pipelines, chains, and prompts
+check [RAG-Piplines.md](https://github.com/arjbingly/Capstone_5/blob/main/projects/Basic-RAG/RAG-Piplines.md)
 
-- Stuff document chain (Refer [RAG-Pipelines.md](./RAG-Piplines.md))
-- Top-k retrival
+### Contents:
 
-## BasicRAG v2
-
-- Refine document chain (Refer [RAG-Pipelines.md](./RAG-Piplines.md))
-- Top-k retrival
+- **BasicRAG_CustomPrompt.py**: Learn to integrate custom prompts into Basic RAG for tailored query responses.
+- **BasicRAG_FewShotPrompt.py**: Explore the use of few-shot prompts to enhance Basic RAG's contextual understanding.
+- **BasicRAG_ingest.py**: Demonstrates the process of ingesting PDF files, making them searchable via Basic RAG.
+- **BasicRAG_stuff.py**: A guide on leveraging the stuff chain with Basic RAG for enriched data processing.
+- **BasicRAG_refine.py**: Discover how to refine queries using the refine chain for more precise results with Basic RAG.
diff --git a/projects/Basic-RAG/tests/BasicRAG_v1_test.py b/projects/Basic-RAG/tests/BasicRAG_v1_test.py
deleted file mode 100644
index 0237fd1..0000000
--- a/projects/Basic-RAG/tests/BasicRAG_v1_test.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-from pathlib import Path
-import sys
-
-# add Basic-RAG folder to sys path
-sys.path.insert(1, str(Path(os.getcwd()).parents[0]))
-
-from BasicRAG_stuff import rag
-
-if __name__ == "__main__":
-    query = 'What types of dependencies does dependence analysis identify in loop programs?'
-    response, sources = rag(query)
diff --git a/projects/Basic-RAG/tests/BasicRAG_v2_test.py b/projects/Basic-RAG/tests/BasicRAG_v2_test.py
deleted file mode 100644
index c38d301..0000000
--- a/projects/Basic-RAG/tests/BasicRAG_v2_test.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-from pathlib import Path
-import sys
-
-# add Basic-RAG folder to sys path
-sys.path.insert(1, str(Path(os.getcwd()).parents[0]))
-
-from BasicRAG_refine import rag
-
-if __name__ == "__main__":
-    query = 'What types of dependencies does dependence analysis identify in loop programs?'
-    responses, sources = rag(query)
diff --git a/projects/Retriver-GUI/retriever_app.py b/projects/Retriver-GUI/retriever_app.py
index 9f4198c..5f81213 100644
--- a/projects/Retriver-GUI/retriever_app.py
+++ b/projects/Retriver-GUI/retriever_app.py
@@ -1,3 +1,5 @@
+"""GUI for Retriever."""
+
 import os
 import sys
 from pathlib import Path
@@ -10,42 +12,71 @@
 
 
 class PageHome:
+    """Manages the home page interface and interactions in a web application.
+
+    Attributes:
+        app: The application instance holding components like the retriever.
+    """
+
     def __init__(self, app):
+        """Initializes the PageHome with the application instance.
+
+        Args:
+            app: The application instance.
+        """
         self.app = app
 
     def render_sidebar(self):
+        """Renders the sidebar options for the application."""
         with st.sidebar:
-            st.session_state.metadata_toggle = st.toggle('Show Metadata')
-            st.session_state.top_k = st.number_input('Show Top K',
-                                                     min_value=0,
-                                                     value=3,
-                                                     step=1)
+            st.session_state.metadata_toggle = st.toggle("Show Metadata")
+            st.session_state.top_k = st.number_input(
+                "Show Top K", min_value=0, value=3, step=1
+            )
 
     def render_search_form(self):
+        """Renders the search form and returns the state of the search button."""
         st.markdown("Enter query")
         with st.form("search_form"):
-            st.session_state.query = st.text_input("Query:", value='What is Artificial Intelligence?')
+            st.session_state.query = st.text_input(
+                "Query:", value="What is Artificial Intelligence?"
+            )
             return st.form_submit_button("Search")
 
     def get_search_results(self, _query, _top_k):
-        return self.app.retriever.get_chunk(_query,
-                                            top_k=_top_k,
-                                            with_score=True)
+        """Retrieves search results based on the query and top_k parameter.
+
+        Args:
+            _query: The search query.
+            _top_k: The number of top results to retrieve.
+
+        Returns:
+            A list of search results with scores.
+        """
+        return self.app.retriever.get_chunk(_query, top_k=_top_k, with_score=True)
 
     def render_search_results(self):
+        """Displays the search results."""
         with st.spinner("Searching for similar chunks with :" + st.session_state.query):
-            results = self.get_search_results(st.session_state.query, st.session_state.top_k)
+            results = self.get_search_results(
+                st.session_state.query, st.session_state.top_k
+            )
             has_results = len(results) != 0
             if not has_results:
                 return st.markdown("Could not find anything similar.")
             # st.write(results)
             for i, (result, score) in enumerate(results):
-                with st.expander(f':bulb:**{i}** - Similiarity Score: {score:.3f}'):
+                with st.expander(f":bulb:**{i}** - Similiarity Score: {score:.3f}"):
                     st.write(result.page_content)
                     if st.session_state.metadata_toggle:
                         st.write(result.metadata)
 
     def check_connection(self):
+        """Checks the connection to the search backend.
+
+        Returns:
+            True if the connection is active, False otherwise.
+        """
         response = self.app.retriever.vectordb.test_connection()
         if response:
             return True
+ """ + return self.app.retriever.get_chunk(_query, top_k=_top_k, with_score=True) def render_search_results(self): + """Displays the search results.""" with st.spinner("Searching for similar chunks with :" + st.session_state.query): - results = self.get_search_results(st.session_state.query, st.session_state.top_k) + results = self.get_search_results( + st.session_state.query, st.session_state.top_k + ) has_results = len(results) != 0 if not has_results: return st.markdown("Could not find anything similar.") # st.write(results) for i, (result, score) in enumerate(results): - with st.expander(f':bulb:**{i}** - Similiarity Score: {score:.3f}'): + with st.expander(f":bulb:**{i}** - Similiarity Score: {score:.3f}"): st.write(result.page_content) if st.session_state.metadata_toggle: st.write(result.metadata) def check_connection(self): + """Checks the connection to the search backend. + + Returns: + True if the connection is active, False otherwise. + """ response = self.app.retriever.vectordb.test_connection() if response: return True @@ -53,24 +84,26 @@ def check_connection(self): return False def render_stats(self): - st.write(f''' + """Renders statistics and details about the search backend.""" + st.write(f""" **Chroma Client Details:** \n Host Address : {self.app.retriever.vectordb.host}:{self.app.retriever.vectordb.port} \n Collection Name : {self.app.retriever.vectordb.collection_name} \n Embeddings Type : {self.app.retriever.vectordb.embedding_type} \n Embeddings Model: {self.app.retriever.vectordb.embedding_model} \n Number of docs : {self.app.retriever.vectordb.collection.count()} \n - ''') - if st.button('Check Connection'): + """) + if st.button("Check Connection"): response = self.app.retriever.vectordb.test_connection() if response: - st.write(':green[Connection Active]') + st.write(":green[Connection Active]") else: - st.write(':red[Connection Lost]') + st.write(":red[Connection Lost]") def render(self): + """Main rendering function for the home page, orchestrating the UI components.""" self.render_sidebar() - tab1, tab2 = st.tabs(['Search', 'Details']) + tab1, tab2 = st.tabs(["Search", "Details"]) with tab1: submitted = self.render_search_form() if submitted: @@ -80,12 +113,18 @@ def render(self): class App: + """Represents the main application for the Retriever system. + + This class initializes the application and sets up the main interface. 
+ """ def __init__(self): + """Initializes the application with a Retriever instance.""" self.retriever = Retriever() def render(self): - st.title('Retriever App') + """Renders the application title and the home page interface.""" + st.title("Retriever App") PageHome(self).render() diff --git a/src/config.ini b/src/config.ini index c2938f9..e23c1b5 100644 --- a/src/config.ini +++ b/src/config.ini @@ -14,12 +14,6 @@ n_gpu_layers_cpp : -1 std_out : True base_dir : ${root:root_path}/models -[deeplake] -collection_name : arxiv -embedding_type : instructor-embedding -embedding_model : hkunlp/instructor-xl -store_path : ${data:data_path}/vectordb - [chroma] host : localhost port : 8000 diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py index 9fd8664..05478df 100644 --- a/src/grag/components/multivec_retriever.py +++ b/src/grag/components/multivec_retriever.py @@ -6,8 +6,10 @@ import asyncio import uuid -from typing import Any, Dict, List, Optional +from pathlib import Path +from typing import Any, Dict, List, Optional, Union +from grag.components.parse_pdf import ParsePDF from grag.components.text_splitter import TextSplitter from grag.components.utils import get_config from grag.components.vectordb.base import VectorDB @@ -15,6 +17,8 @@ from langchain.retrievers.multi_vector import MultiVectorRetriever from langchain.storage import LocalFileStore from langchain_core.documents import Document +from tqdm import tqdm +from tqdm.asyncio import tqdm as atqdm multivec_retriever_conf = get_config()["multivec_retriever"] @@ -31,7 +35,8 @@ class Retriever: id_key: A key prefix for identifying documents vectordb: ChromaClient class instance from components.client store: langchain.storage.LocalFileStore object, stores the key value pairs of document id and parent file - retriever: langchain.retrievers.multi_vector.MultiVectorRetriever class instance, langchain's multi-vector retriever + retriever: langchain.retrievers.multi_vector.MultiVectorRetriever class instance, + langchain's multi-vector retriever splitter: TextSplitter class instance from components.text_splitter namespace: Namespace for producing unique id top_k: Number of top chunks to return from similarity search. @@ -50,10 +55,12 @@ def __init__( """Initialize the Retriever. Args: + vectordb: Vector DB client instance store_path: Path to the local file store, defaults to argument from config file id_key: A key prefix for identifying documents, defaults to argument from config file namespace: A namespace for producing unique id, defaults to argument from congig file top_k: Number of top chunks to return from similarity search, defaults to 1 + client_kwargs: kwargs to pass to the vectordb client """ self.store_path = store_path self.id_key = id_key @@ -227,3 +234,87 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False): ids.append(d.metadata[self.id_key]) docs = self.retriever.docstore.mget(ids) return [d for d in docs if d is not None] + + def ingest( + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: dict = None, + ): + """Ingests the files in directory. 
@@ -227,3 +234,87 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
             ids.append(d.metadata[self.id_key])
         docs = self.retriever.docstore.mget(ids)
         return [d for d in docs if d is not None]
+
+    def ingest(
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: dict = None,
+    ):
+        """Ingests the files in directory.
+
+        Args:
+            dir_path: path to the directory
+            glob_pattern: glob pattern to identify files
+            dry_run: if True, does not ingest any files
+            verbose: if True, shows progress
+            parser_kwargs: arguments to pass to the parser
+
+        """
+        _formats_to_add = ["Text", "Tables"]
+        filepath_gen = Path(dir_path).glob(glob_pattern)
+        if parser_kwargs:
+            parser = ParsePDF(parser_kwargs)
+        else:
+            parser = ParsePDF()
+        if verbose:
+            num_files = len(list(Path(dir_path).glob(glob_pattern)))
+            pbar = tqdm(filepath_gen, total=num_files, desc="Ingesting Files")
+            for filepath in pbar:
+                if not dry_run:
+                    pbar.set_postfix_str(
+                        f"Parsing file - {filepath.relative_to(dir_path)}"
+                    )
+                    docs = parser.load_file(filepath)
+                    pbar.set_postfix_str(
+                        f"Adding file - {filepath.relative_to(dir_path)}"
+                    )
+                    for format_key in _formats_to_add:
+                        self.add_docs(docs[format_key])
+                    print(f"Completed adding - {filepath.relative_to(dir_path)}")
+                else:
+                    print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")
+
+    async def aingest(
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: dict = None,
+    ):
+        """Asynchronously ingests the files in directory.
+
+        Args:
+            dir_path: path to the directory
+            glob_pattern: glob pattern to identify files
+            dry_run: if True, does not ingest any files
+            verbose: if True, shows progress
+            parser_kwargs: arguments to pass to the parser
+
+        """
+        _formats_to_add = ["Text", "Tables"]
+        filepath_gen = Path(dir_path).glob(glob_pattern)
+        if parser_kwargs:
+            parser = ParsePDF(parser_kwargs)
+        else:
+            parser = ParsePDF()
+        if verbose:
+            num_files = len(list(Path(dir_path).glob(glob_pattern)))
+            pbar = atqdm(filepath_gen, total=num_files, desc="Ingesting Files")
+            for filepath in pbar:
+                if not dry_run:
+                    pbar.set_postfix_str(
+                        f"Parsing file - {filepath.relative_to(dir_path)}"
+                    )
+                    docs = parser.load_file(filepath)
+                    pbar.set_postfix_str(
+                        f"Adding file - {filepath.relative_to(dir_path)}"
+                    )
+                    for format_key in _formats_to_add:
+                        await self.aadd_docs(docs[format_key])
+                    print(f"Completed adding - {filepath.relative_to(dir_path)}")
+                else:
+                    print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")
diff --git a/src/grag/rag/basic_rag.py b/src/grag/rag/basic_rag.py
index da461b6..1b45d9a 100644
--- a/src/grag/rag/basic_rag.py
+++ b/src/grag/rag/basic_rag.py
@@ -40,6 +40,7 @@ def __init__(
         retriever_kwargs=None,
         custom_prompt: Union[Prompt, FewShotPrompt, None] = None,
     ):
+        """Initialize BasicRAG."""
         if retriever is None:
             if retriever_kwargs is None:
                 self.retriever = Retriever()
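The new `ingest` and `aingest` methods are what the BasicRAG_ingest.py cookbook drives. A short usage sketch with an illustrative (placeholder) directory; the async variant is wrapped in `asyncio.run` since `aingest` is a coroutine that awaits `aadd_docs`:

```python
import asyncio
from pathlib import Path

from grag.components.multivec_retriever import Retriever
from grag.components.vectordb.deeplake_client import DeepLakeClient

retriever = Retriever(vectordb=DeepLakeClient(collection_name="test"))

pdf_dir = Path("data/pdf")  # placeholder directory of PDFs

# dry_run=True only reports which files would be parsed and added.
retriever.ingest(pdf_dir, glob_pattern="**/*.pdf", dry_run=True)

# Actual asynchronous ingestion of the same directory.
asyncio.run(retriever.aingest(pdf_dir))
```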