From 682d70811321a7bf5fe2a5543c930236dcdc3874 Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Mon, 25 Mar 2024 17:29:54 -0400
Subject: [PATCH] BasicRAG cookbooks

---
 projects/Basic-RAG/BasciRAG_CustomPrompt.py  |  2 +-
 projects/Basic-RAG/BasicRAG-ingest_data.py   | 87 --------------------
 projects/Basic-RAG/BasicRAG_FewShotPrompt.py |  2 +-
 projects/Basic-RAG/BasicRAG_refine.py        |  6 +-
 projects/Basic-RAG/BasicRAG_v1_depricated.py | 42 ----------
 projects/Basic-RAG/BasicRAG_v2_depricated.py | 52 ------------
 6 files changed, 7 insertions(+), 184 deletions(-)
 delete mode 100644 projects/Basic-RAG/BasicRAG-ingest_data.py
 delete mode 100644 projects/Basic-RAG/BasicRAG_v1_depricated.py
 delete mode 100644 projects/Basic-RAG/BasicRAG_v2_depricated.py

diff --git a/projects/Basic-RAG/BasciRAG_CustomPrompt.py b/projects/Basic-RAG/BasciRAG_CustomPrompt.py
index 4222aad..3a3eed9 100644
--- a/projects/Basic-RAG/BasciRAG_CustomPrompt.py
+++ b/projects/Basic-RAG/BasciRAG_CustomPrompt.py
@@ -1,5 +1,5 @@
+from grag.components.prompt import Prompt
 from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import Prompt
 
 custom_prompt = Prompt(
     input_keys={"context", "question"},
diff --git a/projects/Basic-RAG/BasicRAG-ingest_data.py b/projects/Basic-RAG/BasicRAG-ingest_data.py
deleted file mode 100644
index 5d2417b..0000000
--- a/projects/Basic-RAG/BasicRAG-ingest_data.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-from pathlib import Path
-from uuid import UUID, uuid5
-
-from grag.components.multivec_retriever import Retriever
-from grag.components.parse_pdf import ParsePDF
-from grag.components.utils import get_config
-from tqdm import tqdm
-
-config = get_config()
-
-DRY_RUN = False
-
-data_path = Path(config['data']['data_path']) / 'pdf'
-formats_to_add = ['Text', 'Tables']
-glob_pattern = '**/*.pdf'
-
-namespace = UUID('8c9040b0-b5cd-4d7c-bc2e-737da1b24ebf')
-record_filename = uuid5(namespace, str(data_path))  # Unique file record name based on folder path
-
-records_dir = Path(config['data']['data_path']) / 'records'
-records_dir.mkdir(parents=True, exist_ok=True)
-record_file = records_dir / f'{record_filename}.txt'
-
-
-def load_processed_files():
-    # Load processed files from a file if it exists
-    if os.path.exists(record_file):
-        with open(record_file, 'r') as file_record:
-            processed_files.update(file_record.read().splitlines())
-
-
-def update_processed_file_record(file_path, dry_run=False):
-    # Update (append) the processed file record file
-    with open(record_file, 'a') as file:
-        if not dry_run:
-            file.write(file_path + '\n')
-
-
-def add_file_to_database(file_path: Path, dry_run=False):
-    # Check if file_path is in the processed file set
-    if str(file_path) not in processed_files:
-        # Add file to the vector database
-        add_to_database(file_path, dry_run=dry_run)
-        # Add file_path to the processed file set
-        processed_files.add(str(file_path))
-        # Update the processed file record file
-        update_processed_file_record(str(file_path), dry_run=dry_run)
-        return f'Completed adding - {file_path.relative_to(data_path)}'
-    else:
-        return f'Already exists - {file_path.relative_to(data_path)}'
-
-
-def add_to_database(file_path, dry_run=False):
-    if not dry_run:
-        docs = parser.load_file(file_path)
-        for format_key in formats_to_add:
-            retriever.add_docs(docs[format_key])
-
-
-parser = ParsePDF()
-retriever = Retriever()
-
-processed_files = set()
-load_processed_files()  # Load processed files into the set on script startup
-
-
-def main():
-    filepath_gen = data_path.glob(glob_pattern)
-    num_files = len(list(data_path.glob(glob_pattern)))
-    print(f'DATA PATH : {data_path}')
-    print(f'No of PDFs to add: {num_files}')
-    pbar = tqdm(filepath_gen, total=num_files, desc='Adding Files ')
-    for file in pbar:
-        pbar.set_postfix({'Current file': file.relative_to(data_path)})
-        pbar.write(add_file_to_database(file, dry_run=DRY_RUN))
-        # if str(file) not in processed_files:
-        #     add_to_database(file, dry_run=DRY_RUN)  # Add file to the vector database
-        #     processed_files.add(str(file))  # Add file_path to processed set
-        #     update_processed_file_record(str(file), dry_run=DRY_RUN)  # Update the processed file record file
-        #     pbar.write(f'Completed adding - {file.relative_to(data_path)}')
-        # else:
-        #     pbar.write(f'Already exists - {file.relative_to(data_path)}')
-
-
-if __name__ == "__main__":
-    main()
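Note: the deleted BasicRAG-ingest_data.py reduces to parsing each PDF and adding its text and table chunks to the retriever; everything else in the script is duplicate-tracking around that loop. A minimal sketch of the core loop, using only the components the deleted script itself imports (the configured data path and the processed-file record are omitted, and 'data/pdf' is a placeholder path):

from pathlib import Path

from grag.components.multivec_retriever import Retriever
from grag.components.parse_pdf import ParsePDF

parser = ParsePDF()
retriever = Retriever()

# 'data/pdf' is a placeholder; the deleted script read this path from its config file.
for pdf_file in Path('data/pdf').glob('**/*.pdf'):
    docs = parser.load_file(pdf_file)           # parse the PDF into per-format documents
    for format_key in ['Text', 'Tables']:       # the formats the deleted script ingested
        retriever.add_docs(docs[format_key])    # push the chunks into the vector database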
diff --git a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
index cd9199c..5fc2d46 100644
--- a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
+++ b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
@@ -1,5 +1,5 @@
+from grag.components.prompt import FewShotPrompt
 from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import FewShotPrompt
 
 custom_few_shot_prompt = FewShotPrompt(
     input_keys={"context", "question"},
diff --git a/projects/Basic-RAG/BasicRAG_refine.py b/projects/Basic-RAG/BasicRAG_refine.py
index cce6398..ad8d8e7 100644
--- a/projects/Basic-RAG/BasicRAG_refine.py
+++ b/projects/Basic-RAG/BasicRAG_refine.py
@@ -1,5 +1,9 @@
-from grag.grag.rag import BasicRAG
+from grag.components.multivec_retriever import Retriever
+from grag.components.vectordb.deeplake_client import DeepLakeClient
+from grag.rag.basic_rag import BasicRAG
 
+client = DeepLakeClient(collection_name="test")
+retriever = Retriever(vectordb=client)
 rag = BasicRAG(doc_chain="refine")
 
 if __name__ == "__main__":
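After this hunk, BasicRAG_refine.py builds a DeepLake-backed retriever and a refine-chain RAG. A sketch of how the updated cookbook might be driven interactively, assuming a BasicRAG instance can be called with a query string, which this hunk does not show (the loop shape mirrors the deprecated cookbooks deleted below):

from grag.components.multivec_retriever import Retriever
from grag.components.vectordb.deeplake_client import DeepLakeClient
from grag.rag.basic_rag import BasicRAG

client = DeepLakeClient(collection_name="test")   # collection name as in the patch
retriever = Retriever(vectordb=client)
rag = BasicRAG(doc_chain="refine")                # refine document chain, as in the patch

if __name__ == "__main__":
    while True:
        query = input("Query: ")
        response = rag(query)   # assumes BasicRAG instances are callable; not confirmed by this hunk
        print(response)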
diff --git a/projects/Basic-RAG/BasicRAG_v1_depricated.py b/projects/Basic-RAG/BasicRAG_v1_depricated.py
deleted file mode 100644
index 63b5b89..0000000
--- a/projects/Basic-RAG/BasicRAG_v1_depricated.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import stuff_docs, load_prompt
-
-# from prompts import
-'''
-Basic RAG v1 - stuff, chunks
-    Given a query, retrieve similar chunks from vector database. Concat them into a single string, called context.
-    Using the prompt template, call llm. Return chunk sources.
-'''
-
-llm_ = LLM()
-# llm = llm_.load_model(pipeline='hf', is_local=False)
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompt_name = 'Llama-2_QA_1.json'
-prompt_path = Path(__file__).parent / 'prompts' / prompt_name
-prompt_template = load_prompt(prompt_path)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    context = stuff_docs(retrieved_docs)
-    prompt = prompt_template.format(context=context, question=query)
-    response = llm.invoke(prompt)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    return response, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        response, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(response)
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
diff --git a/projects/Basic-RAG/BasicRAG_v2_depricated.py b/projects/Basic-RAG/BasicRAG_v2_depricated.py
deleted file mode 100644
index fd2e250..0000000
--- a/projects/Basic-RAG/BasicRAG_v2_depricated.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import load_prompt
-
-
-'''
-Basic RAG v2 - refine, top_k
-
-'''
-
-llm_ = LLM()
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompts_path = Path(__file__).parent / 'prompts'
-prompt_name_question = 'Llama-2_QA_1.json'
-prompt_template_question = load_prompt(prompts_path / prompt_name_question)
-prompt_name_refine = 'Llama-2_QA-refine_1.json'
-prompt_template_refine = load_prompt(prompts_path / prompt_name_refine)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    responses = []
-    for index, doc in enumerate(retrieved_docs):
-        if index == 0:
-            prompt = prompt_template_question.format(context=doc.page_content,
-                                                     question=query)
-            response = llm.invoke(prompt)
-            responses.append(response)
-        else:
-            prompt = prompt_template_refine.format(context=doc.page_content,
-                                                   question=query,
-                                                   existing_answer=responses[-1])
-            response = llm.invoke(prompt)
-            responses.append(response)
-    return responses, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        responses, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(responses[-1])
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
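The deleted v1 and v2 cookbooks are the hand-rolled forms of the two document chains: v1 stuffs the retrieved chunks into a single context and asks the LLM once, while v2 answers on the first chunk and then iteratively refines that answer over the remaining chunks. Under the current API that choice collapses into the doc_chain argument; a sketch, where the "stuff" value is assumed by analogy with the "refine" value used in BasicRAG_refine.py:

from grag.rag.basic_rag import BasicRAG

# v1 behaviour: concatenate the top-k chunks into one context, single LLM call
stuff_rag = BasicRAG(doc_chain="stuff")     # "stuff" is an assumed option, not shown in this patch

# v2 behaviour: answer on the first chunk, then refine with each remaining chunk
refine_rag = BasicRAG(doc_chain="refine")   # matches BasicRAG_refine.py above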