From 515a672bfb197b072eb8dc13f17777237c6a2b3d Mon Sep 17 00:00:00 2001
From: Arjun Bingly <arjunbin@gmail.com>
Date: Tue, 30 Apr 2024 20:25:16 -0400
Subject: [PATCH] Ruff format patch

---
 cookbook/RAG-GUI/app.py                       |  3 +-
 src/grag/components/create_config.py          | 14 +++---
 src/grag/components/llm.py                    | 15 +++---
 src/grag/components/multivec_retriever.py     | 50 ++++++++++---------
 src/grag/components/parse_pdf.py              | 28 +++++------
 src/grag/components/prompt.py                 |  2 +-
 src/grag/components/text_splitter.py          |  4 +-
 src/grag/components/utils.py                  |  8 +--
 src/grag/components/vectordb/base.py          |  4 +-
 src/grag/components/vectordb/chroma_client.py | 28 +++++------
 .../components/vectordb/deeplake_client.py    | 28 +++++------
 src/grag/rag/basic_rag.py                     |  4 +-
 12 files changed, 95 insertions(+), 93 deletions(-)

diff --git a/cookbook/RAG-GUI/app.py b/cookbook/RAG-GUI/app.py
index b2f3085..588da45 100644
--- a/cookbook/RAG-GUI/app.py
+++ b/cookbook/RAG-GUI/app.py
@@ -1,5 +1,4 @@
-"""
-RAG-GUI
+"""RAG-GUI
 =======
 
 A cookbook demonstrating how to run a RAG app on streamlit.
diff --git a/src/grag/components/create_config.py b/src/grag/components/create_config.py
index 9708a18..20bf943 100644
--- a/src/grag/components/create_config.py
+++ b/src/grag/components/create_config.py
@@ -8,7 +8,7 @@
 from importlib_resources import files
 
 
-def create_config(path: Union[str, Path] = '.') -> None:
+def create_config(path: Union[str, Path] = ".") -> None:
     """Create a configuration file if it doesn't exist.
 
     This function checks for the existence of a 'config.ini' file at the given path.
@@ -17,8 +17,8 @@ def create_config(path: Union[str, Path] = '.') -> None:
     and does not overwrite the existing file.
 
     Args:
-        path (Union[str, Path]): The directory path where the 'config.ini' should be 
-                                 located. If not specified, defaults to the current 
+        path (Union[str, Path]): The directory path where the 'config.ini' should be
+                                 located. If not specified, defaults to the current
                                  directory ('.').
 
     Returns:
@@ -29,15 +29,15 @@ def create_config(path: Union[str, Path] = '.') -> None:
         PermissionError: If the process does not have permission to write to the specified
                          directory.
     """
-    default_config_path = files(grag.resources).joinpath('default_config.ini')
-    path = Path(path) / 'config.ini'
+    default_config_path = files(grag.resources).joinpath("default_config.ini")
+    path = Path(path) / "config.ini"
     path = path.resolve()
     if path.exists():
-        print('Config file already exists')
+        print("Config file already exists")
     else:
         shutil.copyfile(default_config_path, path, follow_symlinks=True)
         print(f"Created config file at {path}")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     create_config()
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
index 3560387..f7ede73 100644
--- a/src/grag/components/llm.py
+++ b/src/grag/components/llm.py
@@ -47,15 +47,15 @@ def __init__(
         model_name: str,
         quantization: str,
         pipeline: str,
-        device_map: str = 'auto',
-        task: str = 'text-generation',
-        max_new_tokens: str = '1024',
+        device_map: str = "auto",
+        task: str = "text-generation",
+        max_new_tokens: str = "1024",
         temperature: Union[str, int] = 0.1,
         n_batch: Union[str, int] = 1024,
         n_ctx: Union[str, int] = 6000,
         n_gpu_layers: Union[str, int] = -1,
         std_out: Union[bool, str] = True,
-        base_dir: Union[str, Path] = Path('models'),
+        base_dir: Union[str, Path] = Path("models"),
         callbacks=None,
     ):
         """Initialize the LLM class using the given parameters.
@@ -184,8 +184,11 @@ def llama_cpp(self):
         return llm
 
     def load_model(
-        self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
-        is_local: Optional[bool] = None
+        self,
+        model_name: Optional[str] = None,
+        pipeline: Optional[str] = None,
+        quantization: Optional[str] = None,
+        is_local: Optional[bool] = None,
     ):
         """Loads the model based on the specified pipeline and model name.
 
diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py
index 40f9775..5e07024 100644
--- a/src/grag/components/multivec_retriever.py
+++ b/src/grag/components/multivec_retriever.py
@@ -30,7 +30,7 @@ class Retriever:
     linked document, chunk, etc.
 
     Attributes:
-        vectordb: ChromaClient class instance from components.client 
+        vectordb: ChromaClient class instance from components.client
                    (Optional, if the user provides it, store_path, id_key and namespace is not considered)
         store_path: Path to the local file store
         id_key: A key prefix for identifying documents
@@ -44,13 +44,13 @@ class Retriever:
     """
 
     def __init__(
-            self,
-            vectordb: Optional[VectorDB] = None,
-            store_path: Union[str, Path] = Path('data/doc_store'),
-            top_k: Union[str, int] = 3,
-            id_key: str = 'doc_id',
-            namespace: str = '71e4b558187b270922923569301f1039',
-            client_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        vectordb: Optional[VectorDB] = None,
+        store_path: Union[str, Path] = Path("data/doc_store"),
+        top_k: Union[str, int] = 3,
+        id_key: str = "doc_id",
+        namespace: str = "71e4b558187b270922923569301f1039",
+        client_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Initialize the Retriever.
 
@@ -66,10 +66,12 @@ def __init__(
         self.id_key = id_key
         self.namespace = uuid.UUID(namespace)
         if vectordb is None:
-            if any([self.store_path is None,
-                    self.id_key is None,
-                    self.namespace is None]):
-                raise TypeError("Arguments [store_path, id_key, namespace] or vectordb must be provided.")
+            if any(
+                [self.store_path is None, self.id_key is None, self.namespace is None]
+            ):
+                raise TypeError(
+                    "Arguments [store_path, id_key, namespace] or vectordb must be provided."
+                )
             if client_kwargs is not None:
                 self.vectordb = DeepLakeClient(**client_kwargs)
             else:
@@ -241,12 +243,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
                 return [d for d in docs if d is not None]
 
     def ingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Ingests the files in directory.
 
@@ -283,12 +285,12 @@ def ingest(
                     print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")
 
     async def aingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Asynchronously ingests the files in directory.
 
diff --git a/src/grag/components/parse_pdf.py b/src/grag/components/parse_pdf.py
index 5ec4089..f9a4e21 100644
--- a/src/grag/components/parse_pdf.py
+++ b/src/grag/components/parse_pdf.py
@@ -24,22 +24,22 @@ class ParsePDF:
         image_output_dir (str): Directory to save extracted images, if any.
         add_captions_to_text (bool): Whether to include figure captions in text output. Default is True.
         add_captions_to_blocks (bool): Whether to add captions to table and image blocks. Default is True.
-        add_caption_first (bool): Whether to place captions before their corresponding image or table in the output. 
+        add_caption_first (bool): Whether to place captions before their corresponding image or table in the output.
                                   Default is True.
         table_as_html (bool): Whether to add table elements as HTML. Default is False.
     """
 
     def __init__(
-            self,
-            single_text_out: bool = True,
-            strategy: str = "hi_res",
-            infer_table_structure: bool = True,
-            extract_images: bool = True,
-            image_output_dir: Optional[str] = None,
-            add_captions_to_text: bool = True,
-            add_captions_to_blocks: bool = True,
-            add_caption_first: bool = True,
-            table_as_html: bool = False,
+        self,
+        single_text_out: bool = True,
+        strategy: str = "hi_res",
+        infer_table_structure: bool = True,
+        extract_images: bool = True,
+        image_output_dir: Optional[str] = None,
+        add_captions_to_text: bool = True,
+        add_captions_to_blocks: bool = True,
+        add_caption_first: bool = True,
+        table_as_html: bool = False,
     ):
         """Initialize instance variables with parameters."""
         self.strategy = strategy
@@ -98,7 +98,7 @@ def classify(self, partitions):
             if element.category == "Table":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                            partitions[i + 1].category == "FigureCaption"
+                        partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -109,7 +109,7 @@ def classify(self, partitions):
             elif element.category == "Image":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                            partitions[i + 1].category == "FigureCaption"
+                        partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -197,7 +197,7 @@ def process_tables(self, elements):
 
             if caption_element:
                 if (
-                        self.add_caption_first
+                    self.add_caption_first
                 ):  # if there is a caption, add that before the element
                     content = "\n\n".join([str(caption_element), table_data])
                 else:
diff --git a/src/grag/components/prompt.py b/src/grag/components/prompt.py
index 4c43e1a..7502a72 100644
--- a/src/grag/components/prompt.py
+++ b/src/grag/components/prompt.py
@@ -86,7 +86,7 @@ def __init__(self, **kwargs):
         )
 
     def save(
-            self, filepath: Union[Path, str, None], overwrite=False
+        self, filepath: Union[Path, str, None], overwrite=False
     ) -> Union[None, ValueError]:
         """Saves the prompt class into a json file."""
         dump = self.model_dump_json(indent=2, exclude_defaults=True, exclude_none=True)
diff --git a/src/grag/components/text_splitter.py b/src/grag/components/text_splitter.py
index 7b8275c..35959a0 100644
--- a/src/grag/components/text_splitter.py
+++ b/src/grag/components/text_splitter.py
@@ -21,9 +21,7 @@ class TextSplitter:
     """
 
     def __init__(
-            self,
-            chunk_size: Union[int, str] = 2000,
-            chunk_overlap: Union[int, str] = 400
+        self, chunk_size: Union[int, str] = 2000, chunk_overlap: Union[int, str] = 400
     ):
         """Initialize TextSplitter using chunk_size and chunk_overlap."""
         self.text_splitter = RecursiveCharacterTextSplitter(
diff --git a/src/grag/components/utils.py b/src/grag/components/utils.py
index 233d79c..19c9379 100644
--- a/src/grag/components/utils.py
+++ b/src/grag/components/utils.py
@@ -76,7 +76,7 @@ def get_config(load_env=False):
     if config_path_:
         config_path = Path(config_path_)
     else:
-        script_location = Path('.').resolve()
+        script_location = Path(".").resolve()
         config_path = find_config_path(script_location)
         if config_path is not None:
             os.environ["CONFIG_PATH"] = str(config_path)
@@ -86,9 +86,9 @@ def get_config(load_env=False):
         config = ConfigParser(interpolation=ExtendedInterpolation())
         config.read(config_path)
         print(f"Loaded config from {config_path}.")
-        # Load .env 
+        # Load .env
         if load_env:
-            env_path = Path(config['env']['env_path'])
+            env_path = Path(config["env"]["env_path"])
             if env_path.exists():
                 load_dotenv(env_path)
                 print(f"Loaded environment variables from {env_path}")
@@ -112,7 +112,7 @@ def configure_args(cls):
     Raises:
         TypeError: If there is a mismatch in provided arguments and class constructor requirements.
     """
-    module_namespace = cls.__module__.split('.')[-1]
+    module_namespace = cls.__module__.split(".")[-1]
 
     config = get_config()[module_namespace]
 
diff --git a/src/grag/components/vectordb/base.py b/src/grag/components/vectordb/base.py
index ee11c07..c831b5e 100644
--- a/src/grag/components/vectordb/base.py
+++ b/src/grag/components/vectordb/base.py
@@ -56,7 +56,7 @@ async def aadd_docs(self, docs: List[Document], verbose: bool = True) -> None:
 
     @abstractmethod
     def get_chunk(
-            self, query: str, with_score: bool = False, top_k: Optional[int] = None
+        self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the vector database.
 
@@ -72,7 +72,7 @@ def get_chunk(
 
     @abstractmethod
     async def aget_chunk(
-            self, query: str, with_score: bool = False, top_k: Optional[int] = None
+        self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the vector database (asynchronous).
 
diff --git a/src/grag/components/vectordb/chroma_client.py b/src/grag/components/vectordb/chroma_client.py
index ddb80f2..63c3b32 100644
--- a/src/grag/components/vectordb/chroma_client.py
+++ b/src/grag/components/vectordb/chroma_client.py
@@ -43,12 +43,12 @@ class ChromaClient(VectorDB):
     """
 
     def __init__(
-            self,
-            host: str = 'localhost',
-            port: Union[str, int] = 8000,
-            collection_name: str = 'grag',
-            embedding_type: str = 'instructor-embedding',
-            embedding_model: str = 'hkunlp/instructor-xl',
+        self,
+        host: str = "localhost",
+        port: Union[str, int] = 8000,
+        collection_name: str = "grag",
+        embedding_type: str = "instructor-embedding",
+        embedding_model: str = "hkunlp/instructor-xl",
     ):
         """Initialize a ChromaClient object.
 
@@ -56,9 +56,9 @@ def __init__(
             host: IP Address of hosted Chroma Vectorstore, defaults to localhost
             port: port address of hosted Chroma Vectorstore, defaults to 8000
             collection_name: name of the collection in the Chroma Vectorstore, defaults to 'grag'
-            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', 
+            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding',
                             defaults to instructor-embedding
-            embedding_model: model name of embedding used, should correspond to the embedding_type, 
+            embedding_model: model name of embedding used, should correspond to the embedding_type,
                              defaults to hkunlp/instructor-xl.
         """
         self.host = host
@@ -127,7 +127,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None:
         """
         docs = self._filter_metadata(docs)
         for doc in (
-                tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
+            tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
         ):
             _id = self.langchain_client.add_documents([doc])
 
@@ -144,9 +144,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None:
         docs = self._filter_metadata(docs)
         if verbose:
             for doc in atqdm(
-                    docs,
-                    desc=f"Adding documents to {self.collection_name}",
-                    total=len(docs),
+                docs,
+                desc=f"Adding documents to {self.collection_name}",
+                total=len(docs),
             ):
                 await self.langchain_client.aadd_documents([doc])
         else:
@@ -154,7 +154,7 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None:
                 await self.langchain_client.aadd_documents([doc])
 
     def get_chunk(
-            self, query: str, with_score: bool = False, top_k: Optional[int] = None
+        self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the chroma database.
 
@@ -177,7 +177,7 @@ def get_chunk(
             )
 
     async def aget_chunk(
-            self, query: str, with_score=False, top_k=None
+        self, query: str, with_score=False, top_k=None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most (cosine) similar chunks from the vector database, asynchronously.
 
diff --git a/src/grag/components/vectordb/deeplake_client.py b/src/grag/components/vectordb/deeplake_client.py
index 1eb57e1..9407cbc 100644
--- a/src/grag/components/vectordb/deeplake_client.py
+++ b/src/grag/components/vectordb/deeplake_client.py
@@ -39,21 +39,21 @@ class DeepLakeClient(VectorDB):
     """
 
     def __init__(
-            self,
-            store_path: Union[str, Path] = Path('data/vectordb'),
-            collection_name: str = 'grag',
-            embedding_type: str = 'instructor-embedding',
-            embedding_model: str = 'kunlp/instructor-xl',
-            read_only: bool = False,
+        self,
+        store_path: Union[str, Path] = Path("data/vectordb"),
+        collection_name: str = "grag",
+        embedding_type: str = "instructor-embedding",
+        embedding_model: str = "kunlp/instructor-xl",
+        read_only: bool = False,
     ):
         """Initialize a DeepLakeClient object.
 
         Args:
             store_path: path to the deeplake vectorstore, defaults to 'data/vectordb'
             collection_name: name of the collection in the DeepLake Vectorstore, defaults to 'grag'
-            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', 
+            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding',
                             defaults to instructor-embedding
-            embedding_model: model name of embedding used, should correspond to the embedding_type, 
+            embedding_model: model name of embedding used, should correspond to the embedding_type,
                              defaults to hkunlp/instructor-xl
             read_only: flag indicating whether the client is read-only, defaults to False.
         """
@@ -96,7 +96,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None:
         """
         docs = self._filter_metadata(docs)
         for doc in (
-                tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
+            tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
         ):
             _id = self.langchain_client.add_documents([doc])
 
@@ -113,9 +113,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None:
         docs = self._filter_metadata(docs)
         if verbose:
             for doc in atqdm(
-                    docs,
-                    desc=f"Adding documents to {self.collection_name}",
-                    total=len(docs),
+                docs,
+                desc=f"Adding documents to {self.collection_name}",
+                total=len(docs),
             ):
                 await self.langchain_client.aadd_documents([doc])
         else:
@@ -123,7 +123,7 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None:
                 await self.langchain_client.aadd_documents([doc])
 
     def get_chunk(
-            self, query: str, with_score: bool = False, top_k: Optional[int] = None
+        self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the deeplake database.
 
@@ -146,7 +146,7 @@ def get_chunk(
             )
 
     async def aget_chunk(
-            self, query: str, with_score=False, top_k=None
+        self, query: str, with_score=False, top_k=None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the deeplake database, asynchronously.
 
diff --git a/src/grag/rag/basic_rag.py b/src/grag/rag/basic_rag.py
index fa76424..2726e95 100644
--- a/src/grag/rag/basic_rag.py
+++ b/src/grag/rag/basic_rag.py
@@ -45,7 +45,7 @@ def __init__(
         """Initialize BasicRAG."""
         if retriever is None:
             if retriever_kwargs is None:
-                self.retriever = Retriever(client_kwargs={'read_only': True})
+                self.retriever = Retriever(client_kwargs={"read_only": True})
             else:
                 self.retriever = Retriever(**retriever_kwargs)
         else:
@@ -216,7 +216,7 @@ def refine_chain(self, query: str):
         prompt = self.refine_prompt.format(
             context=retrieved_docs[-1].page_content,
             question=query,
-            existing_answer=responses[-1]
+            existing_answer=responses[-1],
         )
         return prompt, retrieved_docs, responses