daily update

louisbrulenaudet · Aug 5, 2024 · 1c9a9db · 1c9a9db
1 parent 9fd515c
commit 1c9a9db
Show file tree

Hide file tree

Showing 12 changed files with 244 additions and 41 deletions.
diff --git a/docs/source/autosummary.rst b/docs/source/autosummary.rst
@@ -6,6 +6,7 @@ Autosummary
 
    ragoon
    ragoon.chunks
+   ragoon.datasets
    ragoon.embeddings
    ragoon.similarity_search
    ragoon.web_rag
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
@@ -7,6 +7,7 @@ ragoon
     :glob:
 
     api/ragoon.chunks
+    api/ragoon.datasets
     api/ragoon.embeddings
     api/ragoon.similarity_search
     api/ragoon.web_rag

diff --git a/docs/source/ragoon.rst b/docs/source/ragoon.rst
@@ -15,6 +15,7 @@ Submodules
    :caption: Submodules
 
    api/ragoon.chunks
+   api/ragoon.datasets
    api/ragoon.embeddings
    api/ragoon.similarity_search
    api/ragoon.web_rag
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ install_requires = [
     "numpy<2",
     "numpydoc==1.7.0",
     "openai==1.37.1",
+    "overload==1.1",
     "plotly==5.23.0",
     "pydata-sphinx-theme==0.15.4",
     "pytest==8.3.2",
@@ -53,6 +54,27 @@ testpaths = [
     "tests"
 ]
 
+[tool.ruff]
+line-length = 88
+indent-width = 4
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.lint.pycodestyle]
+max-line-length = 88
+
+[tool.ruff.lint.flake8-quotes]
+docstring-quotes = "double"
+inline-quotes = "double"
+multiline-quotes = "double"
+
 [project.optional-dependencies]
 docs = [
     "sphinx>=6.0.0",

diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,7 @@ myst_parser==3.0.1
 numpy<2
 numpydoc==1.7.0
 openai==1.37.1
+overload==1.1
 plotly==5.23.0
 pydata-sphinx-theme==0.15.4
 pytest==8.3.2

diff --git a/src/ragoon/__init__.py b/src/ragoon/__init__.py
@@ -17,7 +17,7 @@
 from ragoon.similarity_search import SimilaritySearch
 from ragoon.web_rag import WebRAG
 
-from ragoon._dataset import (
+from ragoon.dataset import (
 	dataset_loader,
 	load_datasets
 )
diff --git a/src/ragoon/chunks.py b/src/ragoon/chunks.py
@@ -8,15 +8,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json 
 import re
-import string
 import uuid
 
 from concurrent.futures import (
     ThreadPoolExecutor, 
     as_completed
 )
+
 from dataclasses import dataclass
 from typing import (
     IO,
@@ -42,6 +41,10 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer
 
+from ragoon._logger import Logger
+
+logger = Logger()
+
 
 @dataclass
 class ChunkMetadata:

diff --git a/src/ragoon/_dataset.py → src/ragoon/datasets.py b/src/ragoon/_dataset.py → src/ragoon/datasets.py
@@ -9,7 +9,8 @@
 # limitations under the License.
 
 import concurrent.futures
-import os
+
+import datasets
 
 from typing import (
     IO,
@@ -27,16 +28,18 @@
     Sequence,
 )
 
-import datasets
+from datasets import load_dataset
+from tqdm import tqdm
 
 from ragoon._logger import Logger
 
 logger = Logger()
 
 
 def dataset_loader(
-    name: str,
-    streaming: bool=True
+    name: str, 
+    streaming: Optional[bool] = True,
+    split: Optional[Union[str, List[str]]] = None
 ) -> datasets.Dataset:
     """
     Helper function that loads a single dataset; used as the worker run in parallel by load_datasets.
@@ -49,6 +52,9 @@ def dataset_loader(
     streaming : bool, optional
         Determines if datasets are streamed. Default is True.
 
+    split : Optional[Union[str, List[str]]], optional
+        Which split of the data to load. If None, will return a dict with all splits (typically datasets.Split.TRAIN and datasets.Split.TEST). If given, will return a single Dataset. Splits can be combined and specified like in tensorflow-datasets.
+
     Returns
     -------
     dataset : datasets.Dataset
@@ -59,13 +65,11 @@ def dataset_loader(
     Exception
         If an error occurs during dataset loading.
     """
-    global logger
-
     try:
-        return datasets.load_dataset(
-            name,
-            split="train",
-            streaming=streaming
+        return load_dataset(
+            name, 
+            streaming=streaming,
+            split=split
         )
 
     except Exception as exc:
@@ -76,7 +80,7 @@ def dataset_loader(
 
 def load_datasets(
     req: list,
-    streaming: bool=True
+    streaming: Optional[bool] = False,
 ) -> list:
     """
     Downloads datasets specified in a list and creates a list of loaded datasets.
@@ -87,7 +91,7 @@ def load_datasets(
         A list containing the names of datasets to be downloaded.
 
     streaming : bool, optional
-        Determines if datasets are streamed. Default is True.
+        Determines if datasets are streamed. Default is False.
 
     Returns
     -------
@@ -101,16 +105,31 @@ def load_datasets(
 
     Examples
     --------
-    >>> datasets = load_datasets(["dataset1", "dataset2"], streaming=False)
+    >>> req = [
+    ...    "louisbrulenaudet/code-artisanat",
+    ...    "louisbrulenaudet/code-action-sociale-familles",
+    ... # ...
+    ... ]
+
+    >>> datasets_list = load_datasets(
+    ...    req=req,
+    ...    streaming=True
+    ... )
+
+    >>> dataset = datasets.concatenate_datasets(
+    ...    datasets_list
+    ... )
     """
-    global logger
-
-    datasets_list: str = []
+    datasets_list = []
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        future_to_dataset = {executor.submit(dataset_loader, name, streaming): name for name in req}
+        future_to_dataset = {
+            executor.submit(dataset_loader, name, streaming): name for name in req
+        }
 
-        for future in tqdm(concurrent.futures.as_completed(future_to_dataset), total=len(req)):
+        for future in tqdm(
+            concurrent.futures.as_completed(future_to_dataset), total=len(req)
+        ):
             name = future_to_dataset[future]
 
             try:

diff --git a/src/ragoon/similarity_search.py b/src/ragoon/similarity_search.py
@@ -8,9 +8,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import concurrent.futures
-import os
-
 from typing import (
     IO,
     TYPE_CHECKING,
@@ -27,13 +24,11 @@
     Sequence,
 )
 
-import datasets
 import faiss
 import numpy as np
 
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.quantization import quantize_embeddings
-from tqdm.notebook import tqdm
 from usearch.index import Index
 
 from ragoon._logger import Logger

diff --git a/src/ragoon/web_rag.py b/src/ragoon/web_rag.py
@@ -8,10 +8,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-
-from groq import Groq
-from openai import OpenAI
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    List,
+    Type,
+    Tuple,
+    Union,
+    Mapping,
+    TypeVar,
+    Callable,
+    Optional,
+    Sequence,
+)
 
 from ragoon._retrieval import Retriever
 from ragoon._scrape import WebScraper
@@ -21,10 +32,10 @@
 class WebRAG:
     def __init__(
         self,
-        google_api_key:str,
-        google_cx:str,
+        google_api_key: str,
+        google_cx: str,
         completion_client,
-        user_agent:str="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
+        user_agent: Optional[str] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
     ) -> None:
         """
         WebRAG class.
@@ -88,9 +99,9 @@ def __init__(
 
     def search(
         self,
-        query:str,
-        completion_model:str,
-        system_prompt:str="""
+        query: str,
+        completion_model: str,
+        system_prompt: Optional[str] = """
         Given the user's input query, generate a concise and relevant Google search
         query that directly addresses the main intent of the user's question. The search query must
         be specifically tailored to retrieve results that can significantly enhance the context for a

diff --git a/tests/test_chunks.py b/tests/test_chunks.py
@@ -27,13 +27,11 @@
 )
 
 from datasets import load_dataset, Dataset
-from transformers import AutoTokenizer
 
 from src.ragoon import (
     ChunkMetadata,
     DatasetChunker
 )
-from dataset_chunker import DatasetChunker, ChunkMetadata
 
 
 @pytest.fixture(scope="module")
@@ -50,9 +48,7 @@ def dataset():
 
 
 @pytest.fixture(scope="module")
-def chunker(
-    dataset
-):
+def chunker(dataset):
     """
     Fixture to initialize the DatasetChunker with example parameters.