From b3cad27e10e1faa9b3237ce0eeac756422d9fd60 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 17:24:46 +0700
Subject: [PATCH] debug and correct version dependencies

---
 s3helper/s3_helper.py | 68 ++++++++++++++++++++++++++-----------------
 setup.py              |  6 ++--
 test_flows.py         |  7 +++--
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/s3helper/s3_helper.py b/s3helper/s3_helper.py
index 10ffcf6..a493c14 100644
--- a/s3helper/s3_helper.py
+++ b/s3helper/s3_helper.py
@@ -5,8 +5,7 @@
 import sys
 import logging
 from datasets import load_dataset, Dataset, load_from_disk
-from typing import Optional, Dict, Any
-
+from typing import Optional, Dict, Any, List
 # Configure logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(message)s')
 
@@ -81,9 +80,10 @@ def ensure_file_local(self, file_name_or_path: str, local_dir: str):
             os.makedirs(file_local_path, exist_ok=True)
             self.download_file(path_components, local_dir)
         else:
-            if 'model' in file_name_or_path:
+            if 'model' in local_dir.lower():
+
                 logging.info(f"Model existed at: {file_local_path}, read from cache")
-            elif 'dataset' in file_name_or_path:
+            elif 'dataset' in local_dir.lower():
                 logging.info(f"Dataset existed at: {file_local_path}, read from cache")
         return file_local_path
 
@@ -111,56 +111,72 @@ class S3HelperAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        model_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        model_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(model_local_path, *model_args, **kwargs)
 
 class S3HelperAutoTokenizer(AutoTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        tokenizer_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        tokenizer_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(tokenizer_local_path, *model_args, **kwargs)
 
 class S3HelperAutoConfig(AutoConfig):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        config_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        config_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(config_local_path, *model_args, **kwargs)
 
 # defined a custom load_dataset from S3 bucket
 def s3_load_dataset(
     dataset_name_or_path: str,
     file_format: str = 'json',
     local_dir: str = './datasets',
+    split: Optional[str] = None,
     *args: Any,
     **kwargs: Any
 ) -> Dataset:
     """
     Load a dataset from S3/Minio storage.
-    Args:
-        path (str): Path to the dataset in the format 'bucket_name/dataset_name'
-        file_format: File format of the dataset. Either 'json' or 'csv' or 'parquet'.
-        local_dir (str): Local directory to store downloaded datasets
-        *args: Additional positional arguments to pass to load_dataset
-        **kwargs: Additional keyword arguments to pass to load_dataset
-
+    dataset_name_or_path (str): Path to the dataset in the format 'bucket_name/dataset_name'
+    file_format (str): File format of the dataset. Either 'json', 'csv', or 'parquet'.
+    local_dir (str): Local directory to store downloaded datasets
+    split (str): Dataset split to load ('train', 'test', or None for all)
+    *args: Additional positional arguments to pass to load_dataset
+    **kwargs: Additional keyword arguments to pass to load_dataset
     Returns:
-        Dataset: The loaded dataset
+    Dataset: The loaded dataset
     """
     s3_helper = S3Helper.get_instance()
-    # Split the path into bucket and dataset name
-    dataset_local_path = ensure_file_local(dataset_name_or_path, local_dir)
+    dataset_local_path = s3_helper.ensure_file_local(dataset_name_or_path, local_dir)
+
+    def find_files(path: str, extension: str) -> List[str]:
+        return [os.path.join(root, file) for root, _, files in os.walk(path)
+                for file in files if file.endswith(f'.{extension}')]
+
     local_files = find_files(dataset_local_path, file_format)
-    dataset_local_paths = [os.path.join(dataset_local_path, file) for file in local_files]
-    train_local_paths = []
-    test_local_paths = []
-    for file in dataset_local_paths:
+    logging.info(f"Found local files: {local_files}")
+
+    data_files: Dict[str, List[str]] = {"train": [], "test": []}
+    for file in local_files:
         if "train" in file:
-            train_local_paths.append(file)
+            data_files["train"].append(file)
         elif "test" in file:
-            test_local_paths.append(file)
+            data_files["test"].append(file)
         else:
-            raise ValueError("Not Implemented")
-    # Load and return the dataset
-    return load_dataset(file_format, data_files={'train': train_local_paths, "test": test_local_paths}, *args, **kwargs)
\ No newline at end of file
+            logging.warning(f"Unclassified file: {file}")
+
+    if split:
+        if split not in data_files:
+            raise ValueError(f"Invalid split: {split}. Available splits are: {list(data_files.keys())}")
+        data_files = {split: data_files[split]}
+
+    # Remove empty splits
+    data_files = {k: v for k, v in data_files.items() if v}
+
+    if not data_files:
+        raise ValueError("No valid files found for the specified format and split.")
+
+    logging.info(f"Loading dataset with data_files: {data_files}")
+    return load_dataset(file_format, data_files=data_files, *args, **kwargs)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index eb072c1..818a845 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='research-utils',
-    version='0.2.0', # Increment the version number
+    version='0.2.1', # Increment the version number
     description='A helper library for working with S3/Minio, Hugging Face models, and datasets',
     long_description='This library provides utilities for downloading and managing machine learning models and datasets from S3-compatible storage services, and loading them using the Hugging Face libraries.',
     author='Alan',
@@ -12,8 +12,10 @@
     packages=find_packages(),
     install_requires=[
         'boto3',
+        # Pin tokenizers to a known-working version
+        'tokenizers==0.13.3',
         'transformers',
-        'datasets', # Add the datasets library
+        'datasets==2.20.0', # Add the datasets library
     ],
     classifiers=[
         'Programming Language :: Python :: 3',
diff --git a/test_flows.py b/test_flows.py
index cbd8dbe..2031fc7 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -7,10 +7,11 @@
 S3Helper()
 
 # # Example usage
-# model_name = "thunghiem/tinyllama"
+model_name = "jan-hq/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+# print(tokenizer)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
-dataset = s3_load_dataset("jan-hq/test_dataset",file_format='parquet', split='train')
+dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')
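
Reviewer note: a minimal usage sketch of the API this patch settles on. It assumes the helpers are importable from the s3helper package (adjust to s3helper.s3_helper if they are not re-exported) and that the S3/MinIO credentials and endpoint expected by S3Helper are already configured in the environment; the bucket, tokenizer, and dataset names below are illustrative placeholders, not real resources.

# Usage sketch only -- bucket names and paths are placeholders.
from s3helper import S3Helper, S3HelperAutoTokenizer, s3_load_dataset

S3Helper()  # initialize the singleton; connection settings come from the environment

# Tokenizer files are mirrored from the bucket into ./models and loaded from the local copy.
tokenizer = S3HelperAutoTokenizer.from_pretrained("my-bucket/my-tokenizer")

# Dataset files are mirrored into ./datasets; with split='train', only files whose
# names contain "train" are forwarded to datasets.load_dataset as parquet shards.
dataset = s3_load_dataset("my-bucket/my-dataset", file_format="parquet", split="train")
print(dataset)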