debug and correct version's dependency
bachvudinh committed Jul 16, 2024
1 parent 4a311d0 commit b3cad27
Showing 3 changed files with 50 additions and 31 deletions.
68 changes: 42 additions & 26 deletions s3helper/s3_helper.py
@@ -5,8 +5,7 @@
 import sys
 import logging
 from datasets import load_dataset, Dataset, load_from_disk
-from typing import Optional, Dict, Any
-
+from typing import Optional, Dict, Any, List
 # Configure logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(message)s')
 
@@ -81,9 +80,10 @@ def ensure_file_local(self, file_name_or_path: str, local_dir: str):
             os.makedirs(file_local_path, exist_ok=True)
             self.download_file(path_components, local_dir)
         else:
-            if 'model' in file_name_or_path:
+            if 'model' in local_dir.lower():
+
                 logging.info(f"Model existed at: {file_local_path}, read from cache")
-            elif 'dataset' in file_name_or_path:
+            elif 'dataset' in local_dir.lower():
                 logging.info(f"Dataset existed at: {file_local_path}, read from cache")
         return file_local_path
 
@@ -111,56 +111,72 @@ class S3HelperAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        model_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        model_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(model_local_path, *model_args, **kwargs)
 
 class S3HelperAutoTokenizer(AutoTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        tokenizer_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        tokenizer_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(tokenizer_local_path, *model_args, **kwargs)
 
 class S3HelperAutoConfig(AutoConfig):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        config_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        config_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(config_local_path, *model_args, **kwargs)
 # defined a custom load_dataset from S3 bucket
 def s3_load_dataset(
     dataset_name_or_path: str,
     file_format: str = 'json',
     local_dir: str = './datasets',
+    split: str = None,
     *args: Any,
     **kwargs: Any
 ) -> Dataset:
     """
     Load a dataset from S3/Minio storage.
     Args:
-        path (str): Path to the dataset in the format 'bucket_name/dataset_name'
-        file_format: File format of the dataset. Either 'json' or 'csv' or 'parquet'.
-        local_dir (str): Local directory to store downloaded datasets
-        *args: Additional positional arguments to pass to load_dataset
-        **kwargs: Additional keyword arguments to pass to load_dataset
+        dataset_name_or_path (str): Path to the dataset in the format 'bucket_name/dataset_name'
+        file_format (str): File format of the dataset. Either 'json', 'csv', or 'parquet'.
+        local_dir (str): Local directory to store downloaded datasets
+        split (str): Dataset split to load ('train', 'test', or None for all)
+        *args: Additional positional arguments to pass to load_dataset
+        **kwargs: Additional keyword arguments to pass to load_dataset
     Returns:
-        Dataset: The loaded dataset
+        Dataset: The loaded dataset
     """
     s3_helper = S3Helper.get_instance()
-    # Split the path into bucket and dataset name
-    dataset_local_path = ensure_file_local(dataset_name_or_path, local_dir)
+    dataset_local_path = s3_helper.ensure_file_local(dataset_name_or_path, local_dir)
 
+    def find_files(path: str, extension: str) -> List[str]:
+        return [os.path.join(root, file) for root, _, files in os.walk(path)
+                for file in files if file.endswith(f'.{extension}')]
+
     local_files = find_files(dataset_local_path, file_format)
-    dataset_local_paths = [os.path.join(dataset_local_path, file) for file in local_files]
-    train_local_paths = []
-    test_local_paths = []
-    for file in dataset_local_paths:
+    logging.info(f"Found local files: {local_files}")
+
+    data_files: Dict[str, List[str]] = {"train": [], "test": []}
+    for file in local_files:
         if "train" in file:
-            train_local_paths.append(file)
+            data_files["train"].append(file)
         elif "test" in file:
-            test_local_paths.append(file)
+            data_files["test"].append(file)
         else:
-            raise ValueError("Not Implemented")
-    # Load and return the dataset
-    return load_dataset(file_format, data_files={'train': train_local_paths, "test": test_local_paths}, *args, **kwargs)
+            logging.warning(f"Unclassified file: {file}")
+
+    if split:
+        if split not in data_files:
+            raise ValueError(f"Invalid split: {split}. Available splits are: {list(data_files.keys())}")
+        data_files = {split: data_files[split]}
+
+    # Remove empty splits
+    data_files = {k: v for k, v in data_files.items() if v}
+
+    if not data_files:
+        raise ValueError(f"No valid files found for the specified format and split.")
+
+    logging.info(f"Loading dataset with data_files: {data_files}")
+    return load_dataset(file_format, data_files=data_files, *args, **kwargs)
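
For orientation, here is a minimal usage sketch of the pieces touched by this hunk. It assumes the package is installed, that these names are importable from the s3helper module (the exact import path is not shown in this diff), and that S3Helper() can find valid S3/Minio credentials in the environment:

# Minimal usage sketch -- import path and credential setup are assumptions, not part of this commit.
from s3helper.s3_helper import S3Helper, S3HelperAutoTokenizer, s3_load_dataset

S3Helper()  # initialize the helper once; endpoint and credentials are expected to come from the environment

# Fetch (or reuse a cached copy of) a tokenizer stored as 'bucket_name/model_name'
tokenizer = S3HelperAutoTokenizer.from_pretrained("jan-hq/tokenizer-tinyllama")

# Load only the 'train' split of a parquet dataset stored as 'bucket_name/dataset_name'
dataset = s3_load_dataset("jan-hq/test-dataset", file_format='parquet', split='train')
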
6 changes: 4 additions & 2 deletions setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='research-utils',
-    version='0.2.0', # Increment the version number
+    version='0.2.1', # Increment the version number
     description='A helper library for working with S3/Minio, Hugging Face models, and datasets',
     long_description='This library provides utilities for downloading and managing machine learning models and datasets from S3-compatible storage services, and loading them using the Hugging Face libraries.',
     author='Alan',
@@ -12,8 +12,10 @@
     packages=find_packages(),
     install_requires=[
         'boto3',
+        # tokenizers >=0.13.3
+        'tokenizers==0.13.3',
         'transformers',
-        'datasets', # Add the datasets library
+        'datasets==2.20.0', # Add the datasets library
     ],
     classifiers=[
         'Programming Language :: Python :: 3',
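
A quick sanity check that the pinned versions above are what actually ended up in the environment (illustrative snippet, not part of the repository):

# Illustrative check of the pins above after installing research-utils.
import tokenizers
import datasets

assert tokenizers.__version__ == "0.13.3", tokenizers.__version__
assert datasets.__version__ == "2.20.0", datasets.__version__
print("pinned dependency versions are installed")
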
7 changes: 4 additions & 3 deletions test_flows.py
@@ -7,10 +7,11 @@
 S3Helper()
 
 # # Example usage
-# model_name = "thunghiem/tinyllama"
+model_name = "jan-hq/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
 # print(tokenizer)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
+# Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
-dataset = s3_load_dataset("jan-hq/test_dataset",file_format='parquet', split='train')
+dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')

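Since s3_load_dataset ultimately delegates to datasets.load_dataset with a data_files mapping, the object returned in the test above can be inspected like any Hugging Face dataset; a short illustrative follow-up (column names depend on the uploaded parquet files and are not shown in this diff):

# Illustrative follow-up to the test above.
print(dataset)                    # shows the available splits and row counts
first_row = dataset["train"][0]   # the 'train' split was requested above
print(list(first_row.keys()))     # actual column names depend on the parquet files
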