Skip to content

Commit

Permalink
daily update
Browse files Browse the repository at this point in the history
  • Loading branch information
louisbrulenaudet committed Aug 5, 2024
1 parent 9fd515c commit 1c9a9db
Show file tree
Hide file tree
Showing 12 changed files with 244 additions and 41 deletions.
1 change: 1 addition & 0 deletions docs/source/autosummary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Autosummary

ragoon
ragoon.chunks
ragoon.datasets
ragoon.embeddings
ragoon.similarity_search
ragoon.web_rag
1 change: 1 addition & 0 deletions docs/source/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ ragoon
:glob:

api/ragoon.chunks
api/ragoon.datasets
api/ragoon.embeddings
api/ragoon.similarity_search
api/ragoon.web_rag
Expand Down
1 change: 1 addition & 0 deletions docs/source/ragoon.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Submodules
:caption: Submodules

api/ragoon.chunks
api/ragoon.datasets
api/ragoon.embeddings
api/ragoon.similarity_search
api/ragoon.web_rag
22 changes: 22 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ install_requires = [
"numpy<2",
"numpydoc==1.7.0",
"openai==1.37.1",
"overload==1.1",
"plotly==5.23.0",
"pydata-sphinx-theme==0.15.4",
"pytest==8.3.2",
Expand All @@ -53,6 +54,27 @@ testpaths = [
"tests"
]

[tool.ruff]
line-length = 88
indent-width = 4

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"

[tool.ruff.lint.pydocstyle]
convention = "numpy"

[tool.ruff.lint.pycodestyle]
max-line-length = 88

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
inline-quotes = "double"
multiline-quotes = "double"

[project.optional-dependencies]
docs = [
"sphinx>=6.0.0",
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ myst_parser==3.0.1
numpy<2
numpydoc==1.7.0
openai==1.37.1
overload==1.1
plotly==5.23.0
pydata-sphinx-theme==0.15.4
pytest==8.3.2
Expand Down
2 changes: 1 addition & 1 deletion src/ragoon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ragoon.similarity_search import SimilaritySearch
from ragoon.web_rag import WebRAG

from ragoon._dataset import (
from ragoon.datasets import (
dataset_loader,
load_datasets
)
7 changes: 5 additions & 2 deletions src/ragoon/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import re
import string
import uuid

from concurrent.futures import (
ThreadPoolExecutor,
as_completed
)

from dataclasses import dataclass
from typing import (
IO,
Expand All @@ -42,6 +41,10 @@
from tqdm import tqdm
from transformers import AutoTokenizer

from ragoon._logger import Logger

logger = Logger()


@dataclass
class ChunkMetadata:
Expand Down
55 changes: 37 additions & 18 deletions src/ragoon/_dataset.py → src/ragoon/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# limitations under the License.

import concurrent.futures
import os

import datasets

from typing import (
IO,
Expand All @@ -27,16 +28,18 @@
Sequence,
)

import datasets
from datasets import load_dataset
from tqdm import tqdm

from ragoon._logger import Logger

logger = Logger()


def dataset_loader(
name: str,
streaming: bool=True
name: str,
streaming: Optional[bool] = True,
split: Optional[Union[str, List[str]]] = None
) -> datasets.Dataset:
"""
Helper function to load a single dataset in parallel.
Expand All @@ -49,6 +52,9 @@ def dataset_loader(
streaming : bool, optional
Determines if datasets are streamed. Default is True.
split : Optional[Union[str, List[str]]], optional
Which split of the data to load. If None, a dict with all splits is returned (typically datasets.Split.TRAIN and datasets.Split.TEST). If given, a single Dataset is returned. Splits can be combined and specified as in tensorflow-datasets. Default is None.
Returns
-------
dataset : datasets.Dataset
Expand All @@ -59,13 +65,11 @@ def dataset_loader(
Exception
If an error occurs during dataset loading.
"""
global logger

try:
return datasets.load_dataset(
name,
split="train",
streaming=streaming
return load_dataset(
name,
streaming=streaming,
split=split
)

except Exception as exc:
Expand All @@ -76,7 +80,7 @@ def dataset_loader(

def load_datasets(
req: list,
streaming: bool=True
streaming: Optional[bool] = False,
) -> list:
"""
Downloads datasets specified in a list and creates a list of loaded datasets.
Expand All @@ -87,7 +91,7 @@ def load_datasets(
A list containing the names of datasets to be downloaded.
streaming : bool, optional
Determines if datasets are streamed. Default is True.
Determines if datasets are streamed. Default is False.
Returns
-------
Expand All @@ -101,16 +105,31 @@ def load_datasets(
Examples
--------
>>> datasets = load_datasets(["dataset1", "dataset2"], streaming=False)
>>> req = [
... "louisbrulenaudet/code-artisanat",
... "louisbrulenaudet/code-action-sociale-familles",
... # ...
... ]
>>> datasets_list = load_datasets(
... req=req,
... streaming=True
... )
>>> dataset = datasets.concatenate_datasets(
... datasets_list
... )
"""
global logger

datasets_list: str = []
datasets_list = []

with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_dataset = {executor.submit(dataset_loader, name, streaming): name for name in req}
future_to_dataset = {
executor.submit(dataset_loader, name, streaming): name for name in req
}

for future in tqdm(concurrent.futures.as_completed(future_to_dataset), total=len(req)):
for future in tqdm(
concurrent.futures.as_completed(future_to_dataset), total=len(req)
):
name = future_to_dataset[future]

try:
Expand Down
5 changes: 0 additions & 5 deletions src/ragoon/similarity_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures
import os

from typing import (
IO,
TYPE_CHECKING,
Expand All @@ -27,13 +24,11 @@
Sequence,
)

import datasets
import faiss
import numpy as np

from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from tqdm.notebook import tqdm
from usearch.index import Index

from ragoon._logger import Logger
Expand Down
31 changes: 21 additions & 10 deletions src/ragoon/web_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from groq import Groq
from openai import OpenAI
from typing import (
IO,
TYPE_CHECKING,
Any,
Dict,
List,
Type,
Tuple,
Union,
Mapping,
TypeVar,
Callable,
Optional,
Sequence,
)

from ragoon._retrieval import Retriever
from ragoon._scrape import WebScraper
Expand All @@ -21,10 +32,10 @@
class WebRAG:
def __init__(
self,
google_api_key:str,
google_cx:str,
google_api_key: str,
google_cx: str,
completion_client,
user_agent:str="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
user_agent: Optional[str] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
) -> None:
"""
WebRAG class.
Expand Down Expand Up @@ -88,9 +99,9 @@ def __init__(

def search(
self,
query:str,
completion_model:str,
system_prompt:str="""
query: str,
completion_model: str,
system_prompt: Optional[str] = """
Given the user's input query, generate a concise and relevant Google search
query that directly addresses the main intent of the user's question. The search query must
be specifically tailored to retrieve results that can significantly enhance the context for a
Expand Down
6 changes: 1 addition & 5 deletions tests/test_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,11 @@
)

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

from src.ragoon import (
ChunkMetadata,
DatasetChunker
)
from dataset_chunker import DatasetChunker, ChunkMetadata


@pytest.fixture(scope="module")
Expand All @@ -50,9 +48,7 @@ def dataset():


@pytest.fixture(scope="module")
def chunker(
dataset
):
def chunker(dataset):
"""
Fixture to initialize the DatasetChunker with example parameters.
Expand Down
Loading

0 comments on commit 1c9a9db

Please sign in to comment.