From b3cad27e10e1faa9b3237ce0eeac756422d9fd60 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 17:24:46 +0700
Subject: [PATCH] debug and correct version dependencies

---
 s3helper/s3_helper.py | 68 ++++++++++++++++++++++++++-----------------
 setup.py              |  6 ++--
 test_flows.py         |  7 +++--
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/s3helper/s3_helper.py b/s3helper/s3_helper.py
index 10ffcf6..a493c14 100644
--- a/s3helper/s3_helper.py
+++ b/s3helper/s3_helper.py
@@ -5,8 +5,7 @@
 import sys
 import logging
 from datasets import load_dataset, Dataset, load_from_disk
-from typing import Optional, Dict, Any
-
+from typing import Optional, Dict, Any, List
 # Configure logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(message)s')
 
@@ -81,9 +80,10 @@ def ensure_file_local(self, file_name_or_path: str, local_dir: str):
             os.makedirs(file_local_path, exist_ok=True)
             self.download_file(path_components, local_dir)
         else:
-            if 'model' in file_name_or_path:
+            if 'model' in local_dir.lower():
+
                 logging.info(f"Model existed at: {file_local_path}, read from cache")
-            elif 'dataset' in file_name_or_path:
+            elif 'dataset' in local_dir.lower():
                 logging.info(f"Dataset existed at: {file_local_path}, read from cache")
         return file_local_path
 
@@ -111,56 +111,72 @@ class S3HelperAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        model_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        model_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(model_local_path, *model_args, **kwargs)
 
 class S3HelperAutoTokenizer(AutoTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        tokenizer_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        tokenizer_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(tokenizer_local_path, *model_args, **kwargs)
 
 class S3HelperAutoConfig(AutoConfig):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        config_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        config_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(config_local_path, *model_args, **kwargs)
 
 # defined a custom load_dataset from S3 bucket
 def s3_load_dataset(
     dataset_name_or_path: str,
     file_format: str = 'json',
     local_dir: str = './datasets',
+    split: Optional[str] = None,
     *args: Any,
     **kwargs: Any
 ) -> Dataset:
     """
     Load a dataset from S3/Minio storage.
-    Args:
-        path (str): Path to the dataset in the format 'bucket_name/dataset_name'
-        file_format: File format of the dataset. Either 'json' or 'csv' or 'parquet'.
-        local_dir (str): Local directory to store downloaded datasets
-        *args: Additional positional arguments to pass to load_dataset
-        **kwargs: Additional keyword arguments to pass to load_dataset
-
+    dataset_name_or_path (str): Path to the dataset in the format 'bucket_name/dataset_name'
+    file_format (str): File format of the dataset. Either 'json', 'csv', or 'parquet'.
+    local_dir (str): Local directory to store downloaded datasets
+    split (str): Dataset split to load ('train', 'test', or None for all)
+    *args: Additional positional arguments to pass to load_dataset
+    **kwargs: Additional keyword arguments to pass to load_dataset
     Returns:
-        Dataset: The loaded dataset
+    Dataset: The loaded dataset
     """
     s3_helper = S3Helper.get_instance()
-    # Split the path into bucket and dataset name
-    dataset_local_path = ensure_file_local(dataset_name_or_path, local_dir)
+    dataset_local_path = s3_helper.ensure_file_local(dataset_name_or_path, local_dir)
+
+    def find_files(path: str, extension: str) -> List[str]:
+        return [os.path.join(root, file) for root, _, files in os.walk(path)
+                for file in files if file.endswith(f'.{extension}')]
+
     local_files = find_files(dataset_local_path, file_format)
-    dataset_local_paths = [os.path.join(dataset_local_path, file) for file in local_files]
-    train_local_paths = []
-    test_local_paths = []
-    for file in dataset_local_paths:
+    logging.info(f"Found local files: {local_files}")
+
+    data_files: Dict[str, List[str]] = {"train": [], "test": []}
+    for file in local_files:
         if "train" in file:
-            train_local_paths.append(file)
+            data_files["train"].append(file)
         elif "test" in file:
-            test_local_paths.append(file)
+            data_files["test"].append(file)
         else:
-            raise ValueError("Not Implemented")
-    # Load and return the dataset
-    return load_dataset(file_format, data_files={'train': train_local_paths, "test": test_local_paths}, *args, **kwargs)
\ No newline at end of file
+            logging.warning(f"Unclassified file: {file}")
+
+    if split:
+        if split not in data_files:
+            raise ValueError(f"Invalid split: {split}. Available splits are: {list(data_files.keys())}")
+        data_files = {split: data_files[split]}
+
+    # Remove empty splits
+    data_files = {k: v for k, v in data_files.items() if v}
+
+    if not data_files:
+        raise ValueError("No valid files found for the specified format and split.")
+
+    logging.info(f"Loading dataset with data_files: {data_files}")
+    return load_dataset(file_format, data_files=data_files, *args, **kwargs)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index eb072c1..818a845 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='research-utils',
-    version='0.2.0', # Increment the version number
+    version='0.2.1', # Increment the version number
     description='A helper library for working with S3/Minio, Hugging Face models, and datasets',
     long_description='This library provides utilities for downloading and managing machine learning models and datasets from S3-compatible storage services, and loading them using the Hugging Face libraries.',
     author='Alan',
@@ -12,8 +12,10 @@
     packages=find_packages(),
     install_requires=[
         'boto3',
+        # Pin tokenizers to a known-working version
+        'tokenizers==0.13.3',
         'transformers',
-        'datasets', # Add the datasets library
+        'datasets==2.20.0', # Add the datasets library
     ],
     classifiers=[
         'Programming Language :: Python :: 3',
diff --git a/test_flows.py b/test_flows.py
index cbd8dbe..2031fc7 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -7,10 +7,11 @@
 S3Helper()
 
 # # Example usage
-# model_name = "thunghiem/tinyllama"
+model_name = "jan-hq/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+# print(tokenizer)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
-dataset = s3_load_dataset("jan-hq/test_dataset",file_format='parquet', split='train')
+dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')
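
Reviewer note: a minimal usage sketch of the API this patch settles on. It assumes the helpers are importable from the s3helper package (adjust to s3helper.s3_helper if they are not re-exported) and that the S3/MinIO credentials and endpoint expected by S3Helper are already configured in the environment; the bucket, tokenizer, and dataset names below are illustrative placeholders, not real resources.

# Usage sketch only -- bucket names and paths are placeholders.
from s3helper import S3Helper, S3HelperAutoTokenizer, s3_load_dataset

S3Helper()  # initialize the singleton; connection settings come from the environment

# Tokenizer files are mirrored from the bucket into ./models and loaded from the local copy.
tokenizer = S3HelperAutoTokenizer.from_pretrained("my-bucket/my-tokenizer")

# Dataset files are mirrored into ./datasets; with split='train', only files whose
# names contain "train" are forwarded to datasets.load_dataset as parquet shards.
dataset = s3_load_dataset("my-bucket/my-dataset", file_format="parquet", split="train")
print(dataset)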