From 682d70811321a7bf5fe2a5543c930236dcdc3874 Mon Sep 17 00:00:00 2001
From: Arjun Bingly
Date: Mon, 25 Mar 2024 17:29:54 -0400
Subject: [PATCH] BasicRAG cookbooks

---
 projects/Basic-RAG/BasciRAG_CustomPrompt.py  |  2 +-
 projects/Basic-RAG/BasicRAG-ingest_data.py   | 87 --------------------
 projects/Basic-RAG/BasicRAG_FewShotPrompt.py |  2 +-
 projects/Basic-RAG/BasicRAG_refine.py        |  6 +-
 projects/Basic-RAG/BasicRAG_v1_depricated.py | 42 ----------
 projects/Basic-RAG/BasicRAG_v2_depricated.py | 52 ------------
 6 files changed, 7 insertions(+), 184 deletions(-)
 delete mode 100644 projects/Basic-RAG/BasicRAG-ingest_data.py
 delete mode 100644 projects/Basic-RAG/BasicRAG_v1_depricated.py
 delete mode 100644 projects/Basic-RAG/BasicRAG_v2_depricated.py

diff --git a/projects/Basic-RAG/BasciRAG_CustomPrompt.py b/projects/Basic-RAG/BasciRAG_CustomPrompt.py
index 4222aad..3a3eed9 100644
--- a/projects/Basic-RAG/BasciRAG_CustomPrompt.py
+++ b/projects/Basic-RAG/BasciRAG_CustomPrompt.py
@@ -1,5 +1,5 @@
+from grag.components.prompt import Prompt
 from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import Prompt
 
 custom_prompt = Prompt(
     input_keys={"context", "question"},
diff --git a/projects/Basic-RAG/BasicRAG-ingest_data.py b/projects/Basic-RAG/BasicRAG-ingest_data.py
deleted file mode 100644
index 5d2417b..0000000
--- a/projects/Basic-RAG/BasicRAG-ingest_data.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-from pathlib import Path
-from uuid import UUID, uuid5
-
-from grag.components.multivec_retriever import Retriever
-from grag.components.parse_pdf import ParsePDF
-from grag.components.utils import get_config
-from tqdm import tqdm
-
-config = get_config()
-
-DRY_RUN = False
-
-data_path = Path(config['data']['data_path']) / 'pdf'
-formats_to_add = ['Text', 'Tables']
-glob_pattern = '**/*.pdf'
-
-namespace = UUID('8c9040b0-b5cd-4d7c-bc2e-737da1b24ebf')
-record_filename = uuid5(namespace, str(data_path))  # Unique file record name based on folder path
-
-records_dir = Path(config['data']['data_path']) / 'records'
-records_dir.mkdir(parents=True, exist_ok=True)
-record_file = records_dir / f'{record_filename}.txt'
-
-
-def load_processed_files():
-    # Load processed files from a file if it exists
-    if os.path.exists(record_file):
-        with open(record_file, 'r') as file_record:
-            processed_files.update(file_record.read().splitlines())
-
-
-def update_processed_file_record(file_path, dry_run=False):
-    # Update (append) the processed file record file
-    with open(record_file, 'a') as file:
-        if not dry_run:
-            file.write(file_path + '\n')
-
-
-def add_file_to_database(file_path: Path, dry_run=False):
-    # Check if file_path is in the processed file set
-    if str(file_path) not in processed_files:
-        # Add file to the vector database
-        add_to_database(file_path, dry_run=dry_run)
-        # Add file_path to the processed file set
-        processed_files.add(str(file_path))
-        # Update the processed file record file
-        update_processed_file_record(str(file_path), dry_run=dry_run)
-        return f'Completed adding - {file_path.relative_to(data_path)}'
-    else:
-        return f'Already exists - {file_path.relative_to(data_path)}'
-
-
-def add_to_database(file_path, dry_run=False):
-    if not dry_run:
-        docs = parser.load_file(file_path)
-        for format_key in formats_to_add:
-            retriever.add_docs(docs[format_key])
-
-
-parser = ParsePDF()
-retriever = Retriever()
-
-processed_files = set()
-load_processed_files()  # Load processed files into the set on script startup
-
-
-def main():
-    filepath_gen = data_path.glob(glob_pattern)
-    num_files = len(list(data_path.glob(glob_pattern)))
-    print(f'DATA PATH : {data_path}')
-    print(f'No of PDFs to add: {num_files}')
-    pbar = tqdm(filepath_gen, total=num_files, desc='Adding Files ')
-    for file in pbar:
-        pbar.set_postfix({'Current file': file.relative_to(data_path)})
-        pbar.write(add_file_to_database(file, dry_run=DRY_RUN))
-        # if str(file) not in processed_files:
-        #     add_to_database(file, dry_run=DRY_RUN)  # Add file to the vector database
-        #     processed_files.add(str(file))  # Add file_path to processed set
-        #     update_processed_file_record(str(file), dry_run=DRY_RUN)  # Update the processed file record file
-        #     pbar.write(f'Completed adding - {file.relative_to(data_path)}')
-        # else:
-        #     pbar.write(f'Already exists - {file.relative_to(data_path)}')
-
-
-if __name__ == "__main__":
-    main()
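Note: the deleted BasicRAG-ingest_data.py reduces to parsing each PDF and adding its text and table chunks to the retriever; everything else in the script is duplicate-tracking around that loop. A minimal sketch of the core loop, using only the components the deleted script itself imports (the configured data path and the processed-file record are omitted, and 'data/pdf' is a placeholder path):

from pathlib import Path

from grag.components.multivec_retriever import Retriever
from grag.components.parse_pdf import ParsePDF

parser = ParsePDF()
retriever = Retriever()

# 'data/pdf' is a placeholder; the deleted script read this path from its config file.
for pdf_file in Path('data/pdf').glob('**/*.pdf'):
    docs = parser.load_file(pdf_file)           # parse the PDF into per-format documents
    for format_key in ['Text', 'Tables']:       # the formats the deleted script ingested
        retriever.add_docs(docs[format_key])    # push the chunks into the vector database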
diff --git a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
index cd9199c..5fc2d46 100644
--- a/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
+++ b/projects/Basic-RAG/BasicRAG_FewShotPrompt.py
@@ -1,5 +1,5 @@
+from grag.components.prompt import FewShotPrompt
 from grag.rag.basic_rag import BasicRAG
-from grap.components.prompt import FewShotPrompt
 
 custom_few_shot_prompt = FewShotPrompt(
     input_keys={"context", "question"},
diff --git a/projects/Basic-RAG/BasicRAG_refine.py b/projects/Basic-RAG/BasicRAG_refine.py
index cce6398..ad8d8e7 100644
--- a/projects/Basic-RAG/BasicRAG_refine.py
+++ b/projects/Basic-RAG/BasicRAG_refine.py
@@ -1,5 +1,9 @@
-from grag.grag.rag import BasicRAG
+from grag.components.multivec_retriever import Retriever
+from grag.components.vectordb.deeplake_client import DeepLakeClient
+from grag.rag.basic_rag import BasicRAG
 
+client = DeepLakeClient(collection_name="test")
+retriever = Retriever(vectordb=client)
 rag = BasicRAG(doc_chain="refine")
 
 if __name__ == "__main__":
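After this hunk, BasicRAG_refine.py builds a DeepLake-backed retriever and a refine-chain RAG. A sketch of how the updated cookbook might be driven interactively, assuming a BasicRAG instance can be called with a query string, which this hunk does not show (the loop shape mirrors the deprecated cookbooks deleted below):

from grag.components.multivec_retriever import Retriever
from grag.components.vectordb.deeplake_client import DeepLakeClient
from grag.rag.basic_rag import BasicRAG

client = DeepLakeClient(collection_name="test")   # collection name as in the patch
retriever = Retriever(vectordb=client)
rag = BasicRAG(doc_chain="refine")                # refine document chain, as in the patch

if __name__ == "__main__":
    while True:
        query = input("Query: ")
        response = rag(query)   # assumes BasicRAG instances are callable; not confirmed by this hunk
        print(response)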
diff --git a/projects/Basic-RAG/BasicRAG_v1_depricated.py b/projects/Basic-RAG/BasicRAG_v1_depricated.py
deleted file mode 100644
index 63b5b89..0000000
--- a/projects/Basic-RAG/BasicRAG_v1_depricated.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import stuff_docs, load_prompt
-
-# from prompts import
-'''
-Basic RAG v1 - stuff, chunks
-    Given a query, retrieve similar chunks from vector database. Concat them into a single string, called context.
-    Using the prompt template, call llm. Return chunk sources.
-'''
-
-llm_ = LLM()
-# llm = llm_.load_model(pipeline='hf', is_local=False)
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompt_name = 'Llama-2_QA_1.json'
-prompt_path = Path(__file__).parent / 'prompts' / prompt_name
-prompt_template = load_prompt(prompt_path)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    context = stuff_docs(retrieved_docs)
-    prompt = prompt_template.format(context=context, question=query)
-    response = llm.invoke(prompt)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    return response, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        response, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(response)
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
diff --git a/projects/Basic-RAG/BasicRAG_v2_depricated.py b/projects/Basic-RAG/BasicRAG_v2_depricated.py
deleted file mode 100644
index fd2e250..0000000
--- a/projects/Basic-RAG/BasicRAG_v2_depricated.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from pathlib import Path
-
-from src.components.llm import LLM, llm_conf
-from src.components.multivec_retriever import Retriever
-from src.components.utils import load_prompt
-
-
-'''
-Basic RAG v2 - refine, top_k
-
-'''
-
-llm_ = LLM()
-llm = llm_.load_model()
-
-retriever = Retriever(top_k=3)
-
-prompts_path = Path(__file__).parent / 'prompts'
-prompt_name_question = 'Llama-2_QA_1.json'
-prompt_template_question = load_prompt(prompts_path / prompt_name_question)
-prompt_name_refine = 'Llama-2_QA-refine_1.json'
-prompt_template_refine = load_prompt(prompts_path / prompt_name_refine)
-
-
-def call_rag(query):
-    retrieved_docs = retriever.get_chunk(query)
-    sources = [doc.metadata["source"] for doc in retrieved_docs]
-    responses = []
-    for index, doc in enumerate(retrieved_docs):
-        if index == 0:
-            prompt = prompt_template_question.format(context=doc.page_content,
-                                                     question=query)
-            response = llm.invoke(prompt)
-            responses.append(response)
-        else:
-            prompt = prompt_template_refine.format(context=doc.page_content,
-                                                   question=query,
-                                                   existing_answer=responses[-1])
-            response = llm.invoke(prompt)
-            responses.append(response)
-    return responses, sources
-
-
-if __name__ == "__main__":
-    while True:
-        query = input("Query:")
-        responses, sources = call_rag(query)
-        if not llm_conf['std_out']:
-            print(responses[-1])
-        print(f'Sources: ')
-        for index, source in enumerate(sources):
-            print(f'\t{index}: {source}')
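The deleted v1 and v2 cookbooks are the hand-rolled forms of the two document chains: v1 stuffs the retrieved chunks into a single context and asks the LLM once, while v2 answers on the first chunk and then iteratively refines that answer over the remaining chunks. Under the current API that choice collapses into the doc_chain argument; a sketch, where the "stuff" value is assumed by analogy with the "refine" value used in BasicRAG_refine.py:

from grag.rag.basic_rag import BasicRAG

# v1 behaviour: concatenate the top-k chunks into one context, single LLM call
stuff_rag = BasicRAG(doc_chain="stuff")     # "stuff" is an assumed option, not shown in this patch

# v2 behaviour: answer on the first chunk, then refine with each remaining chunk
refine_rag = BasicRAG(doc_chain="refine")   # matches BasicRAG_refine.py above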