From 515a672bfb197b072eb8dc13f17777237c6a2b3d Mon Sep 17 00:00:00 2001 From: Arjun Bingly Date: Tue, 30 Apr 2024 20:25:16 -0400 Subject: [PATCH] Ruff format patch --- cookbook/RAG-GUI/app.py | 3 +- src/grag/components/create_config.py | 14 +++--- src/grag/components/llm.py | 15 +++--- src/grag/components/multivec_retriever.py | 50 ++++++++++--------- src/grag/components/parse_pdf.py | 28 +++++------ src/grag/components/prompt.py | 2 +- src/grag/components/text_splitter.py | 4 +- src/grag/components/utils.py | 8 +-- src/grag/components/vectordb/base.py | 4 +- src/grag/components/vectordb/chroma_client.py | 28 +++++------ .../components/vectordb/deeplake_client.py | 28 +++++------ src/grag/rag/basic_rag.py | 4 +- 12 files changed, 95 insertions(+), 93 deletions(-) diff --git a/cookbook/RAG-GUI/app.py b/cookbook/RAG-GUI/app.py index b2f3085..588da45 100644 --- a/cookbook/RAG-GUI/app.py +++ b/cookbook/RAG-GUI/app.py @@ -1,5 +1,4 @@ -""" -RAG-GUI +"""RAG-GUI ======= A cookbook demonstrating how to run a RAG app on streamlit. diff --git a/src/grag/components/create_config.py b/src/grag/components/create_config.py index 9708a18..20bf943 100644 --- a/src/grag/components/create_config.py +++ b/src/grag/components/create_config.py @@ -8,7 +8,7 @@ from importlib_resources import files -def create_config(path: Union[str, Path] = '.') -> None: +def create_config(path: Union[str, Path] = ".") -> None: """Create a configuration file if it doesn't exist. This function checks for the existence of a 'config.ini' file at the given path. @@ -17,8 +17,8 @@ def create_config(path: Union[str, Path] = '.') -> None: and does not overwrite the existing file. Args: - path (Union[str, Path]): The directory path where the 'config.ini' should be - located. If not specified, defaults to the current + path (Union[str, Path]): The directory path where the 'config.ini' should be + located. If not specified, defaults to the current directory ('.'). Returns: @@ -29,15 +29,15 @@ def create_config(path: Union[str, Path] = '.') -> None: PermissionError: If the process does not have permission to write to the specified directory. """ - default_config_path = files(grag.resources).joinpath('default_config.ini') - path = Path(path) / 'config.ini' + default_config_path = files(grag.resources).joinpath("default_config.ini") + path = Path(path) / "config.ini" path = path.resolve() if path.exists(): - print('Config file already exists') + print("Config file already exists") else: shutil.copyfile(default_config_path, path, follow_symlinks=True) print(f"Created config file at {path}") -if __name__ == '__main__': +if __name__ == "__main__": create_config() diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py index 3560387..f7ede73 100644 --- a/src/grag/components/llm.py +++ b/src/grag/components/llm.py @@ -47,15 +47,15 @@ def __init__( model_name: str, quantization: str, pipeline: str, - device_map: str = 'auto', - task: str = 'text-generation', - max_new_tokens: str = '1024', + device_map: str = "auto", + task: str = "text-generation", + max_new_tokens: str = "1024", temperature: Union[str, int] = 0.1, n_batch: Union[str, int] = 1024, n_ctx: Union[str, int] = 6000, n_gpu_layers: Union[str, int] = -1, std_out: Union[bool, str] = True, - base_dir: Union[str, Path] = Path('models'), + base_dir: Union[str, Path] = Path("models"), callbacks=None, ): """Initialize the LLM class using the given parameters. @@ -184,8 +184,11 @@ def llama_cpp(self): return llm def load_model( - self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None, - is_local: Optional[bool] = None + self, + model_name: Optional[str] = None, + pipeline: Optional[str] = None, + quantization: Optional[str] = None, + is_local: Optional[bool] = None, ): """Loads the model based on the specified pipeline and model name. diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py index 40f9775..5e07024 100644 --- a/src/grag/components/multivec_retriever.py +++ b/src/grag/components/multivec_retriever.py @@ -30,7 +30,7 @@ class Retriever: linked document, chunk, etc. Attributes: - vectordb: ChromaClient class instance from components.client + vectordb: ChromaClient class instance from components.client (Optional, if the user provides it, store_path, id_key and namespace is not considered) store_path: Path to the local file store id_key: A key prefix for identifying documents @@ -44,13 +44,13 @@ class Retriever: """ def __init__( - self, - vectordb: Optional[VectorDB] = None, - store_path: Union[str, Path] = Path('data/doc_store'), - top_k: Union[str, int] = 3, - id_key: str = 'doc_id', - namespace: str = '71e4b558187b270922923569301f1039', - client_kwargs: Optional[Dict[str, Any]] = None, + self, + vectordb: Optional[VectorDB] = None, + store_path: Union[str, Path] = Path("data/doc_store"), + top_k: Union[str, int] = 3, + id_key: str = "doc_id", + namespace: str = "71e4b558187b270922923569301f1039", + client_kwargs: Optional[Dict[str, Any]] = None, ): """Initialize the Retriever. @@ -66,10 +66,12 @@ def __init__( self.id_key = id_key self.namespace = uuid.UUID(namespace) if vectordb is None: - if any([self.store_path is None, - self.id_key is None, - self.namespace is None]): - raise TypeError("Arguments [store_path, id_key, namespace] or vectordb must be provided.") + if any( + [self.store_path is None, self.id_key is None, self.namespace is None] + ): + raise TypeError( + "Arguments [store_path, id_key, namespace] or vectordb must be provided." + ) if client_kwargs is not None: self.vectordb = DeepLakeClient(**client_kwargs) else: @@ -241,12 +243,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False): return [d for d in docs if d is not None] def ingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: Optional[Dict[str, Any]] = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: Optional[Dict[str, Any]] = None, ): """Ingests the files in directory. @@ -283,12 +285,12 @@ def ingest( print(f"DRY RUN: found - {filepath.relative_to(dir_path)}") async def aingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: Optional[Dict[str, Any]] = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: Optional[Dict[str, Any]] = None, ): """Asynchronously ingests the files in directory. diff --git a/src/grag/components/parse_pdf.py b/src/grag/components/parse_pdf.py index 5ec4089..f9a4e21 100644 --- a/src/grag/components/parse_pdf.py +++ b/src/grag/components/parse_pdf.py @@ -24,22 +24,22 @@ class ParsePDF: image_output_dir (str): Directory to save extracted images, if any. add_captions_to_text (bool): Whether to include figure captions in text output. Default is True. add_captions_to_blocks (bool): Whether to add captions to table and image blocks. Default is True. - add_caption_first (bool): Whether to place captions before their corresponding image or table in the output. + add_caption_first (bool): Whether to place captions before their corresponding image or table in the output. Default is True. table_as_html (bool): Whether to add table elements as HTML. Default is False. """ def __init__( - self, - single_text_out: bool = True, - strategy: str = "hi_res", - infer_table_structure: bool = True, - extract_images: bool = True, - image_output_dir: Optional[str] = None, - add_captions_to_text: bool = True, - add_captions_to_blocks: bool = True, - add_caption_first: bool = True, - table_as_html: bool = False, + self, + single_text_out: bool = True, + strategy: str = "hi_res", + infer_table_structure: bool = True, + extract_images: bool = True, + image_output_dir: Optional[str] = None, + add_captions_to_text: bool = True, + add_captions_to_blocks: bool = True, + add_caption_first: bool = True, + table_as_html: bool = False, ): """Initialize instance variables with parameters.""" self.strategy = strategy @@ -98,7 +98,7 @@ def classify(self, partitions): if element.category == "Table": if self.add_captions_to_blocks and i + 1 < len(partitions): if ( - partitions[i + 1].category == "FigureCaption" + partitions[i + 1].category == "FigureCaption" ): # check for caption caption_element = partitions[i + 1] else: @@ -109,7 +109,7 @@ def classify(self, partitions): elif element.category == "Image": if self.add_captions_to_blocks and i + 1 < len(partitions): if ( - partitions[i + 1].category == "FigureCaption" + partitions[i + 1].category == "FigureCaption" ): # check for caption caption_element = partitions[i + 1] else: @@ -197,7 +197,7 @@ def process_tables(self, elements): if caption_element: if ( - self.add_caption_first + self.add_caption_first ): # if there is a caption, add that before the element content = "\n\n".join([str(caption_element), table_data]) else: diff --git a/src/grag/components/prompt.py b/src/grag/components/prompt.py index 4c43e1a..7502a72 100644 --- a/src/grag/components/prompt.py +++ b/src/grag/components/prompt.py @@ -86,7 +86,7 @@ def __init__(self, **kwargs): ) def save( - self, filepath: Union[Path, str, None], overwrite=False + self, filepath: Union[Path, str, None], overwrite=False ) -> Union[None, ValueError]: """Saves the prompt class into a json file.""" dump = self.model_dump_json(indent=2, exclude_defaults=True, exclude_none=True) diff --git a/src/grag/components/text_splitter.py b/src/grag/components/text_splitter.py index 7b8275c..35959a0 100644 --- a/src/grag/components/text_splitter.py +++ b/src/grag/components/text_splitter.py @@ -21,9 +21,7 @@ class TextSplitter: """ def __init__( - self, - chunk_size: Union[int, str] = 2000, - chunk_overlap: Union[int, str] = 400 + self, chunk_size: Union[int, str] = 2000, chunk_overlap: Union[int, str] = 400 ): """Initialize TextSplitter using chunk_size and chunk_overlap.""" self.text_splitter = RecursiveCharacterTextSplitter( diff --git a/src/grag/components/utils.py b/src/grag/components/utils.py index 233d79c..19c9379 100644 --- a/src/grag/components/utils.py +++ b/src/grag/components/utils.py @@ -76,7 +76,7 @@ def get_config(load_env=False): if config_path_: config_path = Path(config_path_) else: - script_location = Path('.').resolve() + script_location = Path(".").resolve() config_path = find_config_path(script_location) if config_path is not None: os.environ["CONFIG_PATH"] = str(config_path) @@ -86,9 +86,9 @@ def get_config(load_env=False): config = ConfigParser(interpolation=ExtendedInterpolation()) config.read(config_path) print(f"Loaded config from {config_path}.") - # Load .env + # Load .env if load_env: - env_path = Path(config['env']['env_path']) + env_path = Path(config["env"]["env_path"]) if env_path.exists(): load_dotenv(env_path) print(f"Loaded environment variables from {env_path}") @@ -112,7 +112,7 @@ def configure_args(cls): Raises: TypeError: If there is a mismatch in provided arguments and class constructor requirements. """ - module_namespace = cls.__module__.split('.')[-1] + module_namespace = cls.__module__.split(".")[-1] config = get_config()[module_namespace] diff --git a/src/grag/components/vectordb/base.py b/src/grag/components/vectordb/base.py index ee11c07..c831b5e 100644 --- a/src/grag/components/vectordb/base.py +++ b/src/grag/components/vectordb/base.py @@ -56,7 +56,7 @@ async def aadd_docs(self, docs: List[Document], verbose: bool = True) -> None: @abstractmethod def get_chunk( - self, query: str, with_score: bool = False, top_k: Optional[int] = None + self, query: str, with_score: bool = False, top_k: Optional[int] = None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the vector database. @@ -72,7 +72,7 @@ def get_chunk( @abstractmethod async def aget_chunk( - self, query: str, with_score: bool = False, top_k: Optional[int] = None + self, query: str, with_score: bool = False, top_k: Optional[int] = None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the vector database (asynchronous). diff --git a/src/grag/components/vectordb/chroma_client.py b/src/grag/components/vectordb/chroma_client.py index ddb80f2..63c3b32 100644 --- a/src/grag/components/vectordb/chroma_client.py +++ b/src/grag/components/vectordb/chroma_client.py @@ -43,12 +43,12 @@ class ChromaClient(VectorDB): """ def __init__( - self, - host: str = 'localhost', - port: Union[str, int] = 8000, - collection_name: str = 'grag', - embedding_type: str = 'instructor-embedding', - embedding_model: str = 'hkunlp/instructor-xl', + self, + host: str = "localhost", + port: Union[str, int] = 8000, + collection_name: str = "grag", + embedding_type: str = "instructor-embedding", + embedding_model: str = "hkunlp/instructor-xl", ): """Initialize a ChromaClient object. @@ -56,9 +56,9 @@ def __init__( host: IP Address of hosted Chroma Vectorstore, defaults to localhost port: port address of hosted Chroma Vectorstore, defaults to 8000 collection_name: name of the collection in the Chroma Vectorstore, defaults to 'grag' - embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', + embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', defaults to instructor-embedding - embedding_model: model name of embedding used, should correspond to the embedding_type, + embedding_model: model name of embedding used, should correspond to the embedding_type, defaults to hkunlp/instructor-xl. """ self.host = host @@ -127,7 +127,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None: """ docs = self._filter_metadata(docs) for doc in ( - tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs + tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs ): _id = self.langchain_client.add_documents([doc]) @@ -144,9 +144,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: docs = self._filter_metadata(docs) if verbose: for doc in atqdm( - docs, - desc=f"Adding documents to {self.collection_name}", - total=len(docs), + docs, + desc=f"Adding documents to {self.collection_name}", + total=len(docs), ): await self.langchain_client.aadd_documents([doc]) else: @@ -154,7 +154,7 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: await self.langchain_client.aadd_documents([doc]) def get_chunk( - self, query: str, with_score: bool = False, top_k: Optional[int] = None + self, query: str, with_score: bool = False, top_k: Optional[int] = None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the chroma database. @@ -177,7 +177,7 @@ def get_chunk( ) async def aget_chunk( - self, query: str, with_score=False, top_k=None + self, query: str, with_score=False, top_k=None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most (cosine) similar chunks from the vector database, asynchronously. diff --git a/src/grag/components/vectordb/deeplake_client.py b/src/grag/components/vectordb/deeplake_client.py index 1eb57e1..9407cbc 100644 --- a/src/grag/components/vectordb/deeplake_client.py +++ b/src/grag/components/vectordb/deeplake_client.py @@ -39,21 +39,21 @@ class DeepLakeClient(VectorDB): """ def __init__( - self, - store_path: Union[str, Path] = Path('data/vectordb'), - collection_name: str = 'grag', - embedding_type: str = 'instructor-embedding', - embedding_model: str = 'kunlp/instructor-xl', - read_only: bool = False, + self, + store_path: Union[str, Path] = Path("data/vectordb"), + collection_name: str = "grag", + embedding_type: str = "instructor-embedding", + embedding_model: str = "kunlp/instructor-xl", + read_only: bool = False, ): """Initialize a DeepLakeClient object. Args: store_path: path to the deeplake vectorstore, defaults to 'data/vectordb' collection_name: name of the collection in the DeepLake Vectorstore, defaults to 'grag' - embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', + embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', defaults to instructor-embedding - embedding_model: model name of embedding used, should correspond to the embedding_type, + embedding_model: model name of embedding used, should correspond to the embedding_type, defaults to hkunlp/instructor-xl read_only: flag indicating whether the client is read-only, defaults to False. """ @@ -96,7 +96,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None: """ docs = self._filter_metadata(docs) for doc in ( - tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs + tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs ): _id = self.langchain_client.add_documents([doc]) @@ -113,9 +113,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: docs = self._filter_metadata(docs) if verbose: for doc in atqdm( - docs, - desc=f"Adding documents to {self.collection_name}", - total=len(docs), + docs, + desc=f"Adding documents to {self.collection_name}", + total=len(docs), ): await self.langchain_client.aadd_documents([doc]) else: @@ -123,7 +123,7 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: await self.langchain_client.aadd_documents([doc]) def get_chunk( - self, query: str, with_score: bool = False, top_k: Optional[int] = None + self, query: str, with_score: bool = False, top_k: Optional[int] = None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the deeplake database. @@ -146,7 +146,7 @@ def get_chunk( ) async def aget_chunk( - self, query: str, with_score=False, top_k=None + self, query: str, with_score=False, top_k=None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the deeplake database, asynchronously. diff --git a/src/grag/rag/basic_rag.py b/src/grag/rag/basic_rag.py index fa76424..2726e95 100644 --- a/src/grag/rag/basic_rag.py +++ b/src/grag/rag/basic_rag.py @@ -45,7 +45,7 @@ def __init__( """Initialize BasicRAG.""" if retriever is None: if retriever_kwargs is None: - self.retriever = Retriever(client_kwargs={'read_only': True}) + self.retriever = Retriever(client_kwargs={"read_only": True}) else: self.retriever = Retriever(**retriever_kwargs) else: @@ -216,7 +216,7 @@ def refine_chain(self, query: str): prompt = self.refine_prompt.format( context=retrieved_docs[-1].page_content, question=query, - existing_answer=responses[-1] + existing_answer=responses[-1], ) return prompt, retrieved_docs, responses