diff --git a/default_config.ini b/default_config.ini
new file mode 100644
index 0000000..c1ea6c3
--- /dev/null
+++ b/default_config.ini
@@ -0,0 +1,63 @@
+; This is the default config.ini file generated by GRAG
+; All values are the same as the package defaults
+; Values that do not have a default value are commented out
+
+[llm]
+;model_name : Llama-2-13b-chat
+;quantization : Q5_K_M
+;pipeline : llama_cpp
+device_map : auto
+task : text-generation
+max_new_tokens : 1024
+temperature : 0.1
+n_batch : 1024
+n_ctx : 6000
+n_gpu_layers : -1
+std_out : True
+base_dir : ${root:root_path}/models
+
+[chroma_client]
+host : localhost
+port : 8000
+collection_name : grag
+embedding_type : instructor-embedding
+embedding_model : hkunlp/instructor-xl
+
+[deeplake_client]
+collection_name : grag
+embedding_type : instructor-embedding
+embedding_model : hkunlp/instructor-xl
+store_path : ${data:data_path}/vectordb
+
+[text_splitter]
+chunk_size : 2000
+chunk_overlap : 400
+
+[multivec_retriever]
+store_path : ${data:data_path}/doc_store
+top_k : 3
+id_key : doc_id
+namespace : 71e4b558187b270922923569301f1039
+
+[parse_pdf]
+single_text_out : True
+strategy : hi_res
+infer_table_structure : True
+extract_images : True
+image_output_dir : None
+add_captions_to_text : True
+add_captions_to_blocks : True
+table_as_html : False
+
+[data]
+data_path : ${root:root_path}/data
+
+[env]
+env_path : ${root:root_path}/.env
+
+[root]
+root_path : ~/Capstone_5
+; check if ~ works
+
+[quantize]
+llama_cpp_path : ${root:root_path}
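The `${section:key}` references above (e.g. `${root:root_path}/models`) follow the syntax of Python's standard-library `configparser` with `ExtendedInterpolation`. A minimal sketch of reading the file back standalone (not the package's own `get_config` helper, which is shown later in this diff):

```python
from configparser import ConfigParser, ExtendedInterpolation
from pathlib import Path

# ExtendedInterpolation resolves ${section:key} references such as
# ${root:root_path}/models at read time.
config = ConfigParser(interpolation=ExtendedInterpolation())
config.read("default_config.ini")

print(config["llm"]["max_new_tokens"])  # '1024' -- all values come back as strings
print(config["llm"]["base_dir"])        # '~/Capstone_5/models'

# configparser does not expand '~' itself; expanduser() handles it, which
# the "check if ~ works" note under [root] alludes to.
print(Path(config["root"]["root_path"]).expanduser())
```

Because every value is read back as a string, the classes below consistently annotate config-backed parameters as `Union[str, int]` (or similar) and cast internally.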
diff --git a/src/grag/components/embedding.py b/src/grag/components/embedding.py
index a29e2e2..8b24d45 100644
--- a/src/grag/components/embedding.py
+++ b/src/grag/components/embedding.py
@@ -2,7 +2,7 @@

 This module provides:

-- Embedding
+- Embedding

 """
 from langchain_community.embeddings import HuggingFaceInstructEmbeddings
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
index 1786941..3560387 100644
--- a/src/grag/components/llm.py
+++ b/src/grag/components/llm.py
@@ -22,34 +22,59 @@ class LLM:
     """A class for managing and utilizing large language models (LLMs).

+    This class facilitates the loading and operation of large language models using different pipelines and settings.
+    It supports both local and Hugging Face-based model management, with adjustable parameters for quantization,
+    computational specifics, and output control.
+
     Attributes:
         model_name (str): Name of the model to be loaded.
-        device_map (dict): Device mapping for model execution.
-        task (str): The task for which the model is being used.
-        max_new_tokens (int): Maximum new tokens to be generated.
-        temperature (float): Sampling temperature for generation.
-        n_batch (int): Number of batches for GPU CPP.
-        n_ctx (int): Context size for CPP.
-        n_gpu_layers (int): Number of GPU layers for CPP.
+        quantization (str): Quantization setting for the model, affecting performance and memory usage.
+        pipeline (str): Type of pipeline ('llama_cpp' or 'hf') used for model operations.
+        device_map (str): Device mapping for model execution, defaults to 'auto'.
+        task (str): The task for which the model is being used, defaults to 'text-generation'.
+        max_new_tokens (int): Maximum number of new tokens to be generated, defaults to 1024.
+        temperature (float): Sampling temperature for generation, affecting randomness.
+        n_batch (int): Number of batches for GPU CPP, impacting batch processing.
+        n_ctx (int): Context size for CPP, defining the extent of context considered.
+        n_gpu_layers (int): Number of GPU layers for CPP, specifying computational depth.
+        std_out (bool or str): Flag or descriptor for standard output during operations.
+        base_dir (str or Path): Base directory path for model files, defaults to 'models'.
+        callbacks (list or None): List of callback functions for additional processing.
     """

     def __init__(
-        self,
-        model_name: str,
-        device_map: str,
-        task: str,
-        max_new_tokens: str,
-        temperature: str,
-        n_batch: str,
-        n_ctx: str,
-        n_gpu_layers: str,
-        std_out: Union[bool, str],
-        base_dir: str,
-        quantization: str,
-        pipeline: str,
-        callbacks=None,
+            self,
+            model_name: str,
+            quantization: str,
+            pipeline: str,
+            device_map: str = 'auto',
+            task: str = 'text-generation',
+            max_new_tokens: Union[str, int] = 1024,
+            temperature: Union[str, float] = 0.1,
+            n_batch: Union[str, int] = 1024,
+            n_ctx: Union[str, int] = 6000,
+            n_gpu_layers: Union[str, int] = -1,
+            std_out: Union[bool, str] = True,
+            base_dir: Union[str, Path] = Path('models'),
+            callbacks=None,
     ):
-        """Initialize the LLM class using the given parameters."""
+        """Initialize the LLM class using the given parameters.
+
+        Args:
+            model_name (str): Specifies the model name.
+            quantization (str): Sets the model's quantization configuration.
+            pipeline (str): Determines which pipeline to use for model operations.
+            device_map (str, optional): Device configuration for model deployment.
+            task (str, optional): Defines the specific task or use-case of the model.
+            max_new_tokens (int, optional): Limits the number of tokens generated in one operation.
+            temperature (float, optional): Controls the generation randomness.
+            n_batch (int, optional): Adjusts batch processing size.
+            n_ctx (int, optional): Configures the context size used in model operations.
+            n_gpu_layers (int, optional): Sets the depth of computation in GPU layers.
+            std_out (bool or str, optional): Manages standard output settings.
+            base_dir (str or Path, optional): Specifies the directory for storing model files.
+            callbacks (list, optional): Provides custom callback functions for runtime.
+        """
         self.base_dir = Path(base_dir)
         self._model_name = model_name
         self.quantization = quantization
@@ -159,8 +184,8 @@ def llama_cpp(self):
         return llm

     def load_model(
-        self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
-        is_local: Optional[bool] = None
+            self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
+            is_local: Optional[bool] = None
     ):
         """Loads the model based on the specified pipeline and model name.
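With the reordered signature, only the three parameters without package defaults remain effectively required. A minimal usage sketch; the model and quantization values are taken from the commented-out `[llm]` entries in `default_config.ini` above and should be treated as placeholders:

```python
from grag.components.llm import LLM

# Only the three leading parameters lack package defaults; everything else
# falls back to the values mirrored in default_config.ini.
llm = LLM(
    model_name="Llama-2-13b-chat",
    quantization="Q5_K_M",
    pipeline="llama_cpp",
)

# load_model() dispatches on the configured pipeline ('llama_cpp' or 'hf');
# all of its arguments are optional and default to the instance settings.
model = llm.load_model()
```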
diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py
index 3f85fe9..40f9775 100644
--- a/src/grag/components/multivec_retriever.py
+++ b/src/grag/components/multivec_retriever.py
@@ -2,7 +2,7 @@

 This module provides:

-- Retriever
+- Retriever

 """
 import uuid
@@ -30,9 +30,10 @@ class Retriever:
         linked document, chunk, etc.

     Attributes:
+        vectordb: ChromaClient class instance from components.client
+            (Optional: if the user provides it, store_path, id_key and namespace are not considered)
         store_path: Path to the local file store
         id_key: A key prefix for identifying documents
-        vectordb: ChromaClient class instance from components.client
         store: langchain.storage.LocalFileStore object, stores the key value pairs of document id and parent file
         retriever: langchain.retrievers.multi_vector.MultiVectorRetriever class instance,
             langchain's multi-vector retriever
@@ -43,23 +44,23 @@
     """

     def __init__(
-        self,
-        store_path: Union[str, Path],
-        top_k: str,
-        id_key: str,
-        vectordb: Optional[VectorDB] = None,
-        namespace: Optional[str] = None,
-        client_kwargs: Optional[Dict[str, Any]] = None,
+            self,
+            vectordb: Optional[VectorDB] = None,
+            store_path: Union[str, Path] = Path('data/doc_store'),
+            top_k: Union[str, int] = 3,
+            id_key: str = 'doc_id',
+            namespace: str = '71e4b558187b270922923569301f1039',
+            client_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the Retriever.

        Args:
            vectordb: Vector DB client instance
-            store_path: Path to the local file store, defaults to argument from config file
-            id_key: A key prefix for identifying documents, defaults to argument from config file
-            namespace: A namespace for producing unique id, defaults to argument from congig file
-            top_k: Number of top chunks to return from similarity search, defaults to 1
-            client_kwargs: kwargs to pass to the vectordb client
+            store_path: Path to the local file store, defaults to data/doc_store
+            id_key: A key prefix for identifying documents, defaults to 'doc_id'
+            namespace: A namespace for producing unique id
+            top_k: Number of top chunks to return from similarity search, defaults to 3
+            client_kwargs: kwargs to pass to the vectordb client constructor, optional, defaults to None
        """
        self.store_path = store_path
        self.id_key = id_key
@@ -89,7 +90,7 @@ def __init__(
     def id_gen(self, doc: Document) -> str:
         """Takes a document and returns a unique id (uuid5) using the namespace and document source.

-        This ensures that a single document always gets the same unique id.
+        This ensures that a single document always gets the same unique id.

         Args:
             doc: langchain_core.documents.Document
@@ -240,12 +241,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
         return [d for d in docs if d is not None]

     def ingest(
-        self,
-        dir_path: Union[str, Path],
-        glob_pattern: str = "**/*.pdf",
-        dry_run: bool = False,
-        verbose: bool = True,
-        parser_kwargs: Optional[Dict[str, Any]] = None,
+            self,
+            dir_path: Union[str, Path],
+            glob_pattern: str = "**/*.pdf",
+            dry_run: bool = False,
+            verbose: bool = True,
+            parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Ingests the files in directory.
@@ -282,12 +283,12 @@ def ingest(
                 print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")

     async def aingest(
-        self,
-        dir_path: Union[str, Path],
-        glob_pattern: str = "**/*.pdf",
-        dry_run: bool = False,
-        verbose: bool = True,
-        parser_kwargs: Optional[Dict[str, Any]] = None,
+            self,
+            dir_path: Union[str, Path],
+            glob_pattern: str = "**/*.pdf",
+            dry_run: bool = False,
+            verbose: bool = True,
+            parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Asynchronously ingests the files in directory.
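A sketch of the resulting call-site ergonomics; `DeepLakeClient` stands in for any `VectorDB` subclass accepted by the new optional `vectordb` argument, and `dry_run` behaves as shown in the `ingest` hunk above:

```python
from grag.components.multivec_retriever import Retriever
from grag.components.vectordb.deeplake_client import DeepLakeClient

# Every argument now has a package default, so this works out of the box:
retriever = Retriever()

# An explicit vector-database client and custom settings remain possible;
# dry_run=True only reports the files that ingest() would process.
retriever = Retriever(vectordb=DeepLakeClient(), top_k=5)
retriever.ingest("data/pdfs", glob_pattern="**/*.pdf", dry_run=True)
```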
diff --git a/src/grag/components/parse_pdf.py b/src/grag/components/parse_pdf.py
index 3636098..5ec4089 100644
--- a/src/grag/components/parse_pdf.py
+++ b/src/grag/components/parse_pdf.py
@@ -2,9 +2,11 @@

 This module provides:

-- ParsePDF
+- ParsePDF

 """
+from typing import Optional
+
 from grag.components.utils import configure_args
 from langchain_core.documents import Document
 from unstructured.partition.pdf import partition_pdf
@@ -17,25 +19,27 @@ class ParsePDF:

     Attributes:
         single_text_out (bool): Whether to combine all text elements into a single output document.
         strategy (str): The strategy for PDF partitioning; default is "hi_res" for better accuracy.
-        extract_image_block_types (list): Elements to be extracted as image blocks.
         infer_table_structure (bool): Whether to extract tables during partitioning.
         extract_images (bool): Whether to extract images.
         image_output_dir (str): Directory to save extracted images, if any.
         add_captions_to_text (bool): Whether to include figure captions in text output. Default is True.
         add_captions_to_blocks (bool): Whether to add captions to table and image blocks. Default is True.
-        add_caption_first (bool): Whether to place captions before their corresponding image or table in the output. Default is True.
+        add_caption_first (bool): Whether to place captions before their corresponding image or table in the output.
+            Default is True.
+        table_as_html (bool): Whether to add table elements as HTML. Default is False.
     """

     def __init__(
         self,
-        single_text_out,
-        strategy,
-        infer_table_structure,
-        extract_images,
-        image_output_dir,
-        add_captions_to_text,
-        add_captions_to_blocks,
-        table_as_html,
+        single_text_out: bool = True,
+        strategy: str = "hi_res",
+        infer_table_structure: bool = True,
+        extract_images: bool = True,
+        image_output_dir: Optional[str] = None,
+        add_captions_to_text: bool = True,
+        add_captions_to_blocks: bool = True,
+        add_caption_first: bool = True,
+        table_as_html: bool = False,
     ):
         """Initialize instance variables with parameters."""
         self.strategy = strategy
@@ -51,7 +55,7 @@ def __init__(
         self.add_captions_to_blocks = add_captions_to_blocks
         self.image_output_dir = image_output_dir
         self.single_text_out = single_text_out
-        self.add_caption_first = True
+        self.add_caption_first = add_caption_first
         self.table_as_html = table_as_html

     def partition(self, path: str):
diff --git a/src/grag/components/prompt.py b/src/grag/components/prompt.py
index 7cfd40b..4c43e1a 100644
--- a/src/grag/components/prompt.py
+++ b/src/grag/components/prompt.py
@@ -2,9 +2,9 @@

 This module provides:

-- Prompt - for generic prompts
+- Prompt: for generic prompts

-- FewShotPrompt - for few-shot prompts
+- FewShotPrompt: for few-shot prompts

 """
 import json
@@ -86,7 +86,7 @@ def __init__(self, **kwargs):
         )

     def save(
-        self, filepath: Union[Path, str, None], overwrite=False
+            self, filepath: Union[Path, str, None], overwrite=False
     ) -> Union[None, ValueError]:
         """Saves the prompt class into a json file."""
         dump = self.model_dump_json(indent=2, exclude_defaults=True, exclude_none=True)
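For `parse_pdf.py` above, the defaults mean a parser now needs no configuration at all. A brief sketch, assuming `partition()` returns the partitioned elements (the diff shows only its signature):

```python
from grag.components.parse_pdf import ParsePDF

# All arguments now default sensibly, matching the [parse_pdf] config section.
parser = ParsePDF()

# The previously hard-coded add_caption_first is now a real parameter.
parser = ParsePDF(add_caption_first=False, table_as_html=True)
elements = parser.partition("path/to/document.pdf")
```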
diff --git a/src/grag/components/text_splitter.py b/src/grag/components/text_splitter.py
index 025ec72..7b8275c 100644
--- a/src/grag/components/text_splitter.py
+++ b/src/grag/components/text_splitter.py
@@ -2,7 +2,7 @@

 This module provides:

-- TextSplitter
+- TextSplitter

 """
 from typing import Union
@@ -22,14 +22,13 @@ class TextSplitter:

     def __init__(
         self,
-        chunk_size: Union[int, str],
-        chunk_overlap: Union[int, str]
+        chunk_size: Union[int, str] = 2000,
+        chunk_overlap: Union[int, str] = 400
     ):
-        """Initialize TextSplitter."""
+        """Initialize TextSplitter using chunk_size and chunk_overlap."""
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=int(chunk_size),
             chunk_overlap=int(chunk_overlap),
             length_function=len,
             is_separator_regex=False,
         )
-        """Initialize TextSplitter using chunk_size and chunk_overlap"""
diff --git a/src/grag/components/utils.py b/src/grag/components/utils.py
index 991550c..233d79c 100644
--- a/src/grag/components/utils.py
+++ b/src/grag/components/utils.py
@@ -2,13 +2,15 @@

 This module provides:

-- stuff_docs: concats langchain documents into string
+- stuff_docs: concats langchain documents into string

-- load_prompt: loads json prompt to langchain prompt
+- load_prompt: loads json prompt to langchain prompt

-- find_config_path: finds the path of the 'config.ini' file by traversing up the directory tree from the current path.
+- find_config_path: finds the path of the 'config.ini' file by traversing up the directory tree from the current path.

-- get_config: retrieves and parses the configuration settings from the 'config.ini' file.
+- get_config: retrieves and parses the configuration settings from the 'config.ini' file.
+
+- configure_args: a decorator to configure class instantiation arguments from a 'config.ini' file.

 """
 import os
@@ -38,7 +40,7 @@ def find_config_path(current_path: Path):
     """Finds the path of the 'config.ini' file by traversing up the directory tree from the current path.

     This function starts at the current path and moves up the directory tree until it finds a file named 'config.ini'.
-    If 'config.ini' is not found by the time the root of the directory tree is reached, a FileNotFoundError is raised.
+    If 'config.ini' is not found by the time the root of the directory tree is reached, None is returned.

     Args:
         current_path (Path): The starting point for the search, typically the location of the script being executed.
@@ -61,6 +63,7 @@ def get_config(load_env=False):
     This function locates the 'config.ini' file by calling `find_config_path` using the script's current location.
     It initializes a `ConfigParser` object to read the configuration settings from the located 'config.ini' file.
     Optionally, it can also load environment variables from a `.env` file specified in the config.
+    If a config file cannot be read, a default dictionary is returned.

     Args:
         load_env (bool): If True, load environment variables from the path specified in the 'config.ini'. Defaults to False.
diff --git a/src/grag/components/vectordb/base.py b/src/grag/components/vectordb/base.py
index 420a1b7..ee11c07 100644
--- a/src/grag/components/vectordb/base.py
+++ b/src/grag/components/vectordb/base.py
@@ -2,7 +2,7 @@

 This module provides:

-- VectorDB
+- VectorDB

 """
 from abc import ABC, abstractmethod
@@ -56,7 +56,7 @@ async def aadd_docs(self, docs: List[Document], verbose: bool = True) -> None:

     @abstractmethod
     def get_chunk(
-        self, query: str, with_score: bool = False, top_k: Optional[int] = None
+            self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the vector database.
@@ -72,7 +72,7 @@ def get_chunk(

     @abstractmethod
     async def aget_chunk(
-        self, query: str, with_score: bool = False, top_k: Optional[int] = None
+            self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the vector database (asynchronous).
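To illustrate the `TextSplitter` defaults from `text_splitter.py` above, a minimal sketch; `split_text` is the underlying `RecursiveCharacterTextSplitter` API from langchain, which the module already wraps:

```python
from grag.components.text_splitter import TextSplitter

# Defaults now mirror the [text_splitter] config section: 2000/400.
splitter = TextSplitter()

# Config values arrive as strings, hence the Union[int, str] annotations;
# int() casts them before they reach RecursiveCharacterTextSplitter.
splitter = TextSplitter(chunk_size="1000", chunk_overlap="200")
chunks = splitter.text_splitter.split_text("some long document text ...")
```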
diff --git a/src/grag/components/vectordb/chroma_client.py b/src/grag/components/vectordb/chroma_client.py
index 7047171..ddb80f2 100644
--- a/src/grag/components/vectordb/chroma_client.py
+++ b/src/grag/components/vectordb/chroma_client.py
@@ -2,7 +2,7 @@

 This module provides:

-- ChromaClient
+- ChromaClient

 """
 from typing import List, Optional, Tuple, Union
@@ -24,7 +24,7 @@ class ChromaClient(VectorDB):
     Attributes:
         host : str
             IP Address of hosted Chroma Vectorstore
-        port : str
+        port : str or int
             port address of hosted Chroma Vectorstore
         collection_name : str
             name of the collection in the Chroma Vectorstore, each ChromaClient connects to a single collection
@@ -44,20 +44,22 @@ class ChromaClient(VectorDB):

     def __init__(
         self,
-        host: str,
-        port: str,
-        collection_name: str,
-        embedding_type: str,
-        embedding_model: str,
+        host: str = 'localhost',
+        port: Union[str, int] = 8000,
+        collection_name: str = 'grag',
+        embedding_type: str = 'instructor-embedding',
+        embedding_model: str = 'hkunlp/instructor-xl',
     ):
         """Initialize a ChromaClient object.

         Args:
-            host: IP Address of hosted Chroma Vectorstore, defaults to argument from config file
-            port: port address of hosted Chroma Vectorstore, defaults to argument from config file
-            collection_name: name of the collection in the Chroma Vectorstore, defaults to argument from config file
-            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding', defaults to argument from config file
-            embedding_model: model name of embedding used, should correspond to the embedding_type, defaults to argument from config file
+            host: IP Address of hosted Chroma Vectorstore, defaults to localhost
+            port: port address of hosted Chroma Vectorstore, defaults to 8000
+            collection_name: name of the collection in the Chroma Vectorstore, defaults to 'grag'
+            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding',
+                defaults to instructor-embedding
+            embedding_model: model name of embedding used, should correspond to the embedding_type,
+                defaults to hkunlp/instructor-xl.
         """
         self.host = host
         self.port = port
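The new `ChromaClient` defaults line up with the `[chroma_client]` section of `default_config.ini`. A minimal connection sketch, assuming a Chroma server is already running; the non-default hostname is a placeholder:

```python
from grag.components.vectordb.chroma_client import ChromaClient

# Connects to localhost:8000 and the 'grag' collection by default.
client = ChromaClient()

# port accepts str or int because config.ini values are read as strings.
client = ChromaClient(host="10.0.0.5", port="8000", collection_name="papers")
```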
diff --git a/src/grag/components/vectordb/deeplake_client.py b/src/grag/components/vectordb/deeplake_client.py
index 5c9cbf9..1eb57e1 100644
--- a/src/grag/components/vectordb/deeplake_client.py
+++ b/src/grag/components/vectordb/deeplake_client.py
@@ -2,7 +2,7 @@

 This module provides:

-- DeepLakeClient
+- DeepLakeClient

 """
 from pathlib import Path
@@ -32,21 +32,31 @@ class DeepLakeClient(VectorDB):
             a function of the embedding model, derived from the embedding_type and embedding_modelname
         client: deeplake.core.vectorstore.VectorStore
             DeepLake API
-        collection
-            Chroma API for the collection
+        collection_name: str
+            The name of the collection where the vectors are stored.
         langchain_client: langchain_community.vectorstores.DeepLake
-            LangChain wrapper for DeepLake API
+            LangChain wrapper for DeepLake API.
     """

     def __init__(
         self,
-        collection_name: str,
-        store_path: Union[str, Path],
-        embedding_type: str,
-        embedding_model: str,
+        store_path: Union[str, Path] = Path('data/vectordb'),
+        collection_name: str = 'grag',
+        embedding_type: str = 'instructor-embedding',
+        embedding_model: str = 'hkunlp/instructor-xl',
         read_only: bool = False,
     ):
-        """Initialize DeepLake client object."""
+        """Initialize a DeepLakeClient object.
+
+        Args:
+            store_path: path to the deeplake vectorstore, defaults to 'data/vectordb'
+            collection_name: name of the collection in the DeepLake Vectorstore, defaults to 'grag'
+            embedding_type: type of embedding used, supported 'sentence-transformers' and 'instructor-embedding',
+                defaults to instructor-embedding
+            embedding_model: model name of embedding used, should correspond to the embedding_type,
+                defaults to hkunlp/instructor-xl
+            read_only: flag indicating whether the client is read-only, defaults to False.
+        """
        self.store_path = Path(store_path)
        self.collection_name = collection_name
        self.read_only = read_only
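And the equivalent sketch for `DeepLakeClient`; `get_chunk` is the query method every `VectorDB` subclass implements per the abstract interface in `base.py` above:

```python
from grag.components.vectordb.deeplake_client import DeepLakeClient

# Local store under data/vectordb, collection 'grag', instructor embeddings.
client = DeepLakeClient(read_only=True)  # read-only is enough for querying

# get_chunk() follows the VectorDB signature shown in base.py.
chunks = client.get_chunk("How does the retriever link chunks to documents?", top_k=3)
```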