arjbingly · arjbingly · Mar 25, 2024 · Mar 16, 2024 · Mar 17, 2024 · Mar 18, 2024
diff --git a/.github/workflows/ruff_commit.yml b/.github/workflows/ruff_commit.yml
@@ -0,0 +1,15 @@
+name: Ruff and commit
+on: push  # triggers on every push, regardless of branch
+
+jobs:
+  lint:
+    runs-on: self-hosted
+    steps:
+    - uses: actions/checkout@v4  # same major version as the Ruff Linting workflow
+    - uses: actions/setup-python@v5  # same major version as the Ruff Linting workflow
+    - run: pip install ruff
+    # - run: ruff check src/
+    - run: ruff format src/  # reformats files under src/ in place
+    - uses: stefanzweifel/git-auto-commit-action@v4  # commits any files the formatter changed
+      with:
+        commit_message: 'style fixes by ruff'
diff --git a/.github/workflows/ruff_linting.yml b/.github/workflows/ruff_linting.yml
@@ -0,0 +1,25 @@
+name: Ruff Linting
+on:
+  pull_request:  # only pull requests targeting main trigger this workflow
+    branches:
+      - main
+
+jobs:
+  adopt-ruff:
+    runs-on: self-hosted
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up python
+        id: setup-python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+
+      - name: Install ruff  # NOTE(review): the ruff action below may install ruff itself; confirm this step is needed
+        run: pip install ruff
+
+      - name: Run the ruff action
+        uses: chartboost/ruff-action@v1
+
diff --git a/LICENSE b/LICENSE
diff --git a/llm_quantize/quantize.py b/llm_quantize/quantize.py
@@ -1,6 +1,6 @@
+import os
 import subprocess
 import sys
-import os
 
 
 def execute_commands(model_dir_path, quantization=None):
@@ -13,7 +13,7 @@ def execute_commands(model_dir_path, quantization=None):
         if quantization:
             model_file = f"llama.cpp/models/{model_dir_path}/ggml-model-f16.gguf"
             quantized_model_file = f"llama.cpp/models/{model_dir_path.split('/')[-1]}/ggml-model-{quantization}.gguf"
-            subprocess.run(["llama.cpp/llm_quantize", model_file, quantized_model_file, quantization], check=True)
+            subprocess.run(["llama.cpp/quantize", model_file, quantized_model_file, quantization], check=True)
 
     else:
         print("llama.cpp doesn't exist, check readme how to clone.")

diff --git a/projects/Basic-RAG/BasicRAG_stuff.py b/projects/Basic-RAG/BasicRAG_stuff.py
@@ -1,6 +1,10 @@
-from grag.grag.rag import BasicRAG
+from grag.components.multivec_retriever import Retriever
+from grag.components.vectordb.deeplake_client import DeepLakeClient
+from grag.rag.basic_rag import BasicRAG
 
-rag = BasicRAG(doc_chain="stuff")
+client = DeepLakeClient(collection_name="test")
+retriever = Retriever(vectordb=client)
+rag = BasicRAG(doc_chain="stuff", retriever=retriever)
 
 if __name__ == "__main__":
     while True:

diff --git a/projects/Retriver-GUI/retriever_app.py b/projects/Retriver-GUI/retriever_app.py
@@ -46,7 +46,7 @@ def render_search_results(self):
                     st.write(result.metadata)
 
     def check_connection(self):
-        response = self.app.retriever.client.test_connection()
+        response = self.app.retriever.vectordb.test_connection()
         if response:
             return True
         else:
@@ -55,14 +55,14 @@ def check_connection(self):
     def render_stats(self):
         st.write(f'''
         **Chroma Client Details:** \n
-            Host Address    : {self.app.retriever.client.host}:{self.app.retriever.client.port} \n
-            Collection Name : {self.app.retriever.client.collection_name} \n
-            Embeddings Type : {self.app.retriever.client.embedding_type} \n
-            Embeddings Model: {self.app.retriever.client.embedding_model} \n
-            Number of docs  : {self.app.retriever.client.collection.count()} \n
+            Host Address    : {self.app.retriever.vectordb.host}:{self.app.retriever.vectordb.port} \n
+            Collection Name : {self.app.retriever.vectordb.collection_name} \n
+            Embeddings Type : {self.app.retriever.vectordb.embedding_type} \n
+            Embeddings Model: {self.app.retriever.vectordb.embedding_model} \n
+            Number of docs  : {self.app.retriever.vectordb.collection.count()} \n
         ''')
         if st.button('Check Connection'):
-            response = self.app.retriever.client.test_connection()
+            response = self.app.retriever.vectordb.test_connection()
             if response:
                 st.write(':green[Connection Active]')
             else:

diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
     "huggingface_hub>=0.20.2",
     "pydantic>=2.5.0",
     "rouge-score>=0.1.2",
+    "deeplake>=3.8.27"
 ]
 
 [project.urls]
@@ -97,3 +98,22 @@ exclude_lines = [
     "if __name__ == .__main__.:",
     "if TYPE_CHECKING:",
 ]
+
+[tool.ruff]
+line-length = 88
+indent-width = 4
+extend-exclude = ["tests", "others"]
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9", "F", "I", "D"]
+ignore = ["D104"]
+exclude = ["__about__.py"]
+
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+docstring-code-format = true
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/config.ini b/src/config.ini
@@ -1,18 +1,19 @@
 [llm]
-model_name : Llama-2-13b-chat
+model_name : Llama-2-7b-chat
 # meta-llama/Llama-2-70b-chat-hf Mixtral-8x7B-Instruct-v0.1
 quantization : Q5_K_M
+pipeline : llama_cpp
 device_map : auto
 task : text-generation
 max_new_tokens : 1024
 temperature : 0.1
 n_batch_gpu_cpp : 1024
 n_ctx_cpp : 6000
-n_gpu_layers_cpp : 18
+n_gpu_layers_cpp : -1
 # The number of layers to put on the GPU. Mixtral-18
 std_out : True
 base_dir : ${root:root_path}/models
 
 [chroma]
 host : localhost
 port : 8000
@@ -24,6 +25,14 @@ embedding_model : hkunlp/instructor-xl
 store_path : ${data:data_path}/vectordb
 allow_reset : True
 
+[deeplake]
+collection_name : arxiv
+# embedding_type : sentence-transformers
+# embedding_model : "all-mpnet-base-v2"
+embedding_type : instructor-embedding
+embedding_model : hkunlp/instructor-xl
+store_path : ${data:data_path}/vectordb
+
 [text_splitter]
 chunk_size : 5000
 chunk_overlap : 400
@@ -50,4 +59,7 @@ table_as_html : True
 data_path : ${root:root_path}/data
 
 [root]
-root_path : /home/ubuntu/volume_2k/Capstone_5
+root_path : /home/ubuntu/volume_2k/Capstone_5
+
+[quantize]
+llama_cpp_path : ${root:root_path}
diff --git a/src/grag/components/chroma_client.py b/src/grag/components/chroma_client.py
diff --git a/src/grag/components/embedding.py b/src/grag/components/embedding.py
@@ -1,10 +1,18 @@
+"""Class for embedding.
+
+This module provides:
+- Embedding
+"""
+
 from langchain_community.embeddings import HuggingFaceInstructEmbeddings
-from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
 
 
 class Embedding:
-    """
-    A class for vector embeddings.
+    """A class for vector embeddings.
+
     Supports:
         huggingface sentence transformers -> model_type = 'sentence-transformers'
         huggingface instructor embeddings -> model_type = 'instructor-embedding'
@@ -16,14 +24,19 @@ class Embedding:
     """
 
     def __init__(self, embedding_type: str, embedding_model: str):
+        """Initialize the embedding with embedding_type and embedding_model."""
         self.embedding_type = embedding_type
         self.embedding_model = embedding_model
         match self.embedding_type:
-            case 'sentence-transformers':
-                self.embedding_function = SentenceTransformerEmbeddings(model_name=self.embedding_model)
-            case 'instructor-embedding':
-                self.embedding_instruction = 'Represent the document for retrival'
-                self.embedding_function = HuggingFaceInstructEmbeddings(model_name=self.embedding_model)
+            case "sentence-transformers":
+                self.embedding_function = SentenceTransformerEmbeddings(
+                    model_name=self.embedding_model
+                )
+            case "instructor-embedding":
+                self.embedding_instruction = "Represent the document for retrival"
+                self.embedding_function = HuggingFaceInstructEmbeddings(
+                    model_name=self.embedding_model
+                )
                 self.embedding_function.embed_instruction = self.embedding_instruction
             case _:
-                raise Exception('embedding_type is invalid')
+                raise Exception("embedding_type is invalid")