From 4a311d058b4f59efd7209ecf62ec9708580e70a3 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 09:50:33 +0000
Subject: [PATCH 01/13] refactor the code

---
 .github/workflows/test-flow.yml |   0
 example_env.sh                  |   3 +-
 s3helper/s3_helper.py           | 107 +++++++++++++++++---------
 main.py => test_flows.py        |  13 +---
 4 files changed, 63 insertions(+), 60 deletions(-)
 create mode 100644 .github/workflows/test-flow.yml
 rename main.py => test_flows.py (61%)

diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
new file mode 100644
index 0000000..e69de29
diff --git a/example_env.sh b/example_env.sh
index 2eeb355..9633801 100755
--- a/example_env.sh
+++ b/example_env.sh
@@ -1,3 +1,4 @@
 export S3_ACCESS_KEY=minioadmin
 export S3_SECRET_KEY=minioadmin
-export S3_ENDPOINT_URL=http://127.0.0.1:9000
+export S3_ENDPOINT_URL="http://127.0.0.1:9000"
+mc alias set ALIAS $S3_ENDPOINT_URL $S3_ACCESS_KEY $S3_SECRET_KEY
\ No newline at end of file
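The `mc alias set` line added above registers the MinIO client CLI against the same endpoint and keys that the library reads from the environment. As a quick connectivity check, those three variables are also enough to build a boto3 client by hand; a minimal sketch, assuming a MinIO server is already listening on that endpoint:

    import os
    import boto3

    # Mirrors example_env.sh; minioadmin/minioadmin are MinIO's default dev credentials.
    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.environ["S3_ACCESS_KEY"],
        aws_secret_access_key=os.environ["S3_SECRET_KEY"],
        endpoint_url=os.environ["S3_ENDPOINT_URL"],
    )
    print(s3.list_buckets().get("Buckets", []))  # empty list on a fresh server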
diff --git a/s3helper/s3_helper.py b/s3helper/s3_helper.py
index f8955e9..10ffcf6 100644
--- a/s3helper/s3_helper.py
+++ b/s3helper/s3_helper.py
@@ -4,12 +4,24 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import sys
 import logging
-from datasets import load_dataset, Dataset
+from datasets import load_dataset, Dataset, load_from_disk
 from typing import Optional, Dict, Any
 
 # Configure logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(message)s')
 
+def find_files(directory: str, file_format: str):
+    matching_files = []
+
+    # Walk through the directory
+    for root, _, files in os.walk(directory):
+        for file in files:
+            # Check if the file ends with the specified format
+            if file.endswith(f".{file_format}"):
+                matching_files.append(os.path.join(root, file))
+
+    return matching_files
+
 class S3Helper:
     _instance = None
 
@@ -46,7 +58,7 @@ def validate_credentials(self):
             logging.error(f"Invalid S3 credentials: {e}")
             raise ValueError("Invalid S3 credentials")
 
-    def download_model(self, path_components: list, local_dir: str = './models'):
+    def download_file(self, path_components: list, local_dir: str):
         bucket_name = path_components[0]
         model_name = path_components[1]
         objects = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=model_name)
@@ -59,38 +71,41 @@ def download_model(self, path_components: list, local_dir: str = './models'):
             self.s3_client.download_file(bucket_name, file_key, file_path)
             logging.info(f'Downloaded file: {file_key}')
 
-    def ensure_model_local(self, pretrained_model_name_or_path, local_dir):
-        path_components = pretrained_model_name_or_path.split("/")
+    def ensure_file_local(self, file_name_or_path: str, local_dir: str):
+        path_components = file_name_or_path.split("/")
         if len(path_components) != 2:
-            logging.error("Cannot recognize bucket name and model name since having > 2 components")
-            raise ValueError("Cannot recognize bucket name and model name since having > 2 components")
-        model_local_path = os.path.join(local_dir, pretrained_model_name_or_path)
-        if not os.path.exists(model_local_path):
-            os.makedirs(model_local_path, exist_ok=True)
-            self.download_model(path_components, local_dir)
+            logging.error("Cannot recognize bucket name and file name since the components are not 2")
+            raise ValueError("Cannot recognize bucket name and file name since the components are not 2")
+        file_local_path = os.path.join(local_dir, file_name_or_path)
+        if not os.path.exists(file_local_path):
+            os.makedirs(file_local_path, exist_ok=True)
+            self.download_file(path_components, local_dir)
         else:
-            logging.info(f"Model existed at: {model_local_path}, read from cache")
-        return model_local_path
+            if 'model' in file_name_or_path:
+                logging.info(f"Model existed at: {file_local_path}, read from cache")
+            elif 'dataset' in file_name_or_path:
+                logging.info(f"Dataset existed at: {file_local_path}, read from cache")
+        return file_local_path
 
-    def upload_to_s3(self, local_dir, bucket_name, model_name):
+    def upload_to_s3(self, local_dir, bucket_name, file_name):
         for root, _, files in os.walk(local_dir):
             for file in files:
                 local_file_path = os.path.join(root, file)
                 s3_key = os.path.relpath(local_file_path, local_dir)
-                self.s3_client.upload_file(local_file_path, bucket_name, os.path.join(model_name, s3_key))
-                logging.info(f'Uploaded {local_file_path} to s3://{bucket_name}/{model_name}/{s3_key}')
-    def download_dataset(self, path_components: list, local_dir: str = './datasets'):
-        bucket_name = path_components[0]
-        dataset_name = path_components[1]
-        objects = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=dataset_name)
-        for obj in objects.get('Contents', []):
-            file_key = obj['Key']
-            if file_key.endswith('/'):
-                continue  # Skip directories
-            file_path = os.path.join(local_dir, bucket_name, file_key)
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            self.s3_client.download_file(bucket_name, file_key, file_path)
-            logging.info(f'Downloaded dataset file: {file_key}')
+                self.s3_client.upload_file(local_file_path, bucket_name, os.path.join(file_name, s3_key))
+                logging.info(f'Uploaded {local_file_path} to s3://{bucket_name}/{file_name}/{s3_key}')
+    # def download_dataset(self, path_components: list, local_dir: str = './datasets'):
+    #     bucket_name = path_components[0]
+    #     dataset_name = path_components[1]
+    #     objects = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=dataset_name)
+    #     for obj in objects.get('Contents', []):
+    #         file_key = obj['Key']
+    #         if file_key.endswith('/'):
+    #             continue # Skip directories
+    #         file_path = os.path.join(local_dir, bucket_name, file_key)
+    #         os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    #         self.s3_client.download_file(bucket_name, file_key, file_path)
+    #         logging.info(f'Downloaded dataset file: {file_key}')
 
 class S3HelperAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
@@ -114,9 +129,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir:
         return super().from_pretrained(config_local_path, *model_args, **kwargs)
 # defined a custom load_dataset from S3 bucket
 def s3_load_dataset(
-    path: str,
-    local_dir: str = './datasets',
+    dataset_name_or_path: str,
     file_format: str = 'json',
+    local_dir: str = './datasets',
     *args: Any,
     **kwargs: Any
 ) -> Dataset:
     """
     Load a dataset from S3/Minio storage.
 
     Args:
     path (str): Path to the dataset in the format 'bucket_name/dataset_name'
+    file_format: File format of the dataset. Either 'json' or 'csv' or 'parquet'.
     local_dir (str): Local directory to store downloaded datasets
-    file_format (str): Format of the dataset file (e.g., 'json', 'csv', 'parquet')
     *args: Additional positional arguments to pass to load_dataset
     **kwargs: Additional keyword arguments to pass to load_dataset
 
     Returns:
     Dataset: The loaded dataset
     """
     s3_helper = S3Helper.get_instance()
-    # Split the path into bucket and dataset name
-    path_components = path.split("/")
-    if len(path_components) != 2:
-        raise ValueError("Path should be in the format 'bucket_name/dataset_name'")
-
-    bucket_name, dataset_name = path_components
-    dataset_local_path = os.path.join(local_dir, bucket_name, dataset_name)
-
-    # Download dataset if not exists locally
-    if not os.path.exists(dataset_local_path):
-        os.makedirs(dataset_local_path, exist_ok=True)
-        s3_helper.download_dataset(path_components, local_dir)
-    else:
-        logging.info(f"Dataset already exists at: {dataset_local_path}, using cached version")
-
-    # Construct the path to the data file
-    data_file_path = os.path.join(dataset_local_path, f"data.{file_format}")
-
+    dataset_local_path = ensure_file_local(dataset_name_or_path, local_dir)
+    local_files = find_files(dataset_local_path, file_format)
+    dataset_local_paths = [os.path.join(dataset_local_path, file) for file in local_files]
+    train_local_paths = []
+    test_local_paths = []
+    for file in dataset_local_paths:
+        if "train" in file:
+            train_local_paths.append(file)
+        elif "test" in file:
+            test_local_paths.append(file)
+        else:
+            raise ValueError("Not Implemented")
     # Load and return the dataset
-    return load_dataset(file_format, data_files=data_file_path, *args, **kwargs)
\ No newline at end of file
+    return load_dataset(file_format, data_files={'train': train_local_paths, "test": test_local_paths}, *args, **kwargs)
\ No newline at end of file
diff --git a/main.py b/test_flows.py
similarity index 61%
rename from main.py
rename to test_flows.py
index 1ba6bad..cbd8dbe 100644
--- a/main.py
+++ b/test_flows.py
@@ -3,7 +3,7 @@
 
 os.environ['S3_ACCESS_KEY'] = 'minioadmin'
 os.environ['S3_SECRET_KEY'] = 'minioadmin'
-os.environ['S3_ENDPOINT_URL'] = 'http://172.17.0.2:9001'
+os.environ['S3_ENDPOINT_URL'] = 'http://172.17.0.2:9000'
 
 S3Helper()
 # # Example usage
@@ -12,12 +12,5 @@
 # tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
-# Load a dataset
-dataset = s3_load_dataset("modelhubjan/test_dataset")
-
-# Use the dataset
-for item in dataset:
-    print(item)
-
-# You can also pass additional arguments to load_dataset
-dataset = s3_load_dataset("modelhubjan/test_dataset", file_format='parquet', split='train')
\ No newline at end of file
+# Load a dataset from S3 bucket
+dataset = s3_load_dataset("jan-hq/test_dataset",file_format='parquet', split='train')
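Patch 01's net effect: models and datasets are both resolved through `ensure_file_local`, keyed on a two-part `bucket_name/object_name` string, with a local cache under `./models` or `./datasets`. One bug survives the refactor: `s3_load_dataset` calls a bare `ensure_file_local`, which is a method of `S3Helper`; patch 02 corrects it to `s3_helper.ensure_file_local`. The intended call pattern looks roughly like this (a sketch only, assuming a reachable MinIO instance populated with the bucket used by the test script):

    import os
    from s3helper import S3Helper, s3_load_dataset

    os.environ.setdefault('S3_ACCESS_KEY', 'minioadmin')
    os.environ.setdefault('S3_SECRET_KEY', 'minioadmin')
    os.environ.setdefault('S3_ENDPOINT_URL', 'http://127.0.0.1:9000')

    S3Helper()  # initialize the singleton from the environment

    # The first call downloads s3://jan-hq/test_dataset into ./datasets;
    # later calls reuse the cached copy.
    dataset = s3_load_dataset("jan-hq/test_dataset", file_format='parquet', split='train')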
From b3cad27e10e1faa9b3237ce0eeac756422d9fd60 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 17:24:46 +0700
Subject: [PATCH 02/13] debug and correct version's dependency

---
 s3helper/s3_helper.py | 68 ++++++++++++++++++++++++++-----------------
 setup.py              |  6 ++--
 test_flows.py         |  7 +++--
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/s3helper/s3_helper.py b/s3helper/s3_helper.py
index 10ffcf6..a493c14 100644
--- a/s3helper/s3_helper.py
+++ b/s3helper/s3_helper.py
@@ -5,8 +5,7 @@
 import sys
 import logging
 from datasets import load_dataset, Dataset, load_from_disk
-from typing import Optional, Dict, Any
-
+from typing import Optional, Dict, Any, List
 
 # Configure logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(message)s')
@@ -81,9 +80,10 @@ def ensure_file_local(self, file_name_or_path: str, local_dir: str):
             os.makedirs(file_local_path, exist_ok=True)
             self.download_file(path_components, local_dir)
         else:
-            if 'model' in file_name_or_path:
+            if 'model' in local_dir.lower():
+
                 logging.info(f"Model existed at: {file_local_path}, read from cache")
-            elif 'dataset' in file_name_or_path:
+            elif 'dataset' in local_dir.lower():
                 logging.info(f"Dataset existed at: {file_local_path}, read from cache")
         return file_local_path
 
@@ -111,56 +111,72 @@ class S3HelperAutoModelForCausalLM(AutoModelForCausalLM):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        model_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        model_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(model_local_path, *model_args, **kwargs)
 
 class S3HelperAutoTokenizer(AutoTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        tokenizer_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        tokenizer_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(tokenizer_local_path, *model_args, **kwargs)
 
 class S3HelperAutoConfig(AutoConfig):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, local_dir: str = './models', **kwargs):
         s3_helper = S3Helper.get_instance()
-        config_local_path = s3_helper.ensure_model_local(pretrained_model_name_or_path, local_dir)
+        config_local_path = s3_helper.ensure_file_local(pretrained_model_name_or_path, local_dir)
         return super().from_pretrained(config_local_path, *model_args, **kwargs)
 # defined a custom load_dataset from S3 bucket
 def s3_load_dataset(
     dataset_name_or_path: str,
     file_format: str = 'json',
     local_dir: str = './datasets',
+    split: str = None,
     *args: Any,
     **kwargs: Any
 ) -> Dataset:
     """
     Load a dataset from S3/Minio storage.
-    Args:
-    path (str): Path to the dataset in the format 'bucket_name/dataset_name'
-    file_format: File format of the dataset. Either 'json' or 'csv' or 'parquet'.
-    local_dir (str): Local directory to store downloaded datasets
-    *args: Additional positional arguments to pass to load_dataset
-    **kwargs: Additional keyword arguments to pass to load_dataset
-
+    dataset_name_or_path (str): Path to the dataset in the format 'bucket_name/dataset_name'
+    file_format (str): File format of the dataset. Either 'json', 'csv', or 'parquet'.
+    local_dir (str): Local directory to store downloaded datasets
+    split (str): Dataset split to load ('train', 'test', or None for all)
+    *args: Additional positional arguments to pass to load_dataset
+    **kwargs: Additional keyword arguments to pass to load_dataset
     Returns:
-    Dataset: The loaded dataset
+        Dataset: The loaded dataset
     """
     s3_helper = S3Helper.get_instance()
-    # Split the path into bucket and dataset name
-    dataset_local_path = ensure_file_local(dataset_name_or_path, local_dir)
+    dataset_local_path = s3_helper.ensure_file_local(dataset_name_or_path, local_dir)
+
+    def find_files(path: str, extension: str) -> List[str]:
+        return [os.path.join(root, file) for root, _, files in os.walk(path)
+                for file in files if file.endswith(f'.{extension}')]
+
     local_files = find_files(dataset_local_path, file_format)
-    dataset_local_paths = [os.path.join(dataset_local_path, file) for file in local_files]
-    train_local_paths = []
-    test_local_paths = []
-    for file in dataset_local_paths:
+    logging.info(f"Found local files: {local_files}")
+
+    data_files: Dict[str, List[str]] = {"train": [], "test": []}
+    for file in local_files:
         if "train" in file:
-            train_local_paths.append(file)
+            data_files["train"].append(file)
         elif "test" in file:
-            test_local_paths.append(file)
+            data_files["test"].append(file)
         else:
-            raise ValueError("Not Implemented")
-    # Load and return the dataset
-    return load_dataset(file_format, data_files={'train': train_local_paths, "test": test_local_paths}, *args, **kwargs)
\ No newline at end of file
+            logging.warning(f"Unclassified file: {file}")
+
+    if split:
+        if split not in data_files:
+            raise ValueError(f"Invalid split: {split}. Available splits are: {list(data_files.keys())}")
+        data_files = {split: data_files[split]}
+
+    # Remove empty splits
+    data_files = {k: v for k, v in data_files.items() if v}
+
+    if not data_files:
+        raise ValueError(f"No valid files found for the specified format and split.")
+
+    logging.info(f"Loading dataset with data_files: {data_files}")
+    return load_dataset(file_format, data_files=data_files, *args, **kwargs)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index eb072c1..818a845 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='research-utils',
-    version='0.2.0',  # Increment the version number
+    version='0.2.1',  # Increment the version number
     description='A helper library for working with S3/Minio, Hugging Face models, and datasets',
     long_description='This library provides utilities for downloading and managing machine learning models and datasets from S3-compatible storage services, and loading them using the Hugging Face libraries.',
     author='Alan',
@@ -12,8 +12,10 @@
     packages=find_packages(),
     install_requires=[
         'boto3',
+        # tokenizers >=0.13.3
+        'tokenizers==0.13.3',
         'transformers',
-        'datasets',  # Add the datasets library
+        'datasets==2.20.0',  # Add the datasets library
     ],
     classifiers=[
         'Programming Language :: Python :: 3',
diff --git a/test_flows.py b/test_flows.py
index cbd8dbe..2031fc7 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -7,10 +7,11 @@
 S3Helper()
 
 # # Example usage
-# model_name = "thunghiem/tinyllama"
+model_name = "jan-hq/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+# print(tokenizer)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
-dataset = s3_load_dataset("jan-hq/test_dataset",file_format='parquet', split='train')
+dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')
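Besides pinning `tokenizers==0.13.3` and `datasets==2.20.0`, patch 02 fixes the unqualified `ensure_file_local` call, keys the cache log messages off `local_dir` instead of the artifact name, and makes the loader split-aware: downloaded files are grouped into 'train' and 'test' by substring match on their paths, empty splits are dropped, and the mapping is handed to `datasets.load_dataset`. The final call it makes is equivalent to something like the following (the parquet paths are hypothetical placeholders):

    from datasets import load_dataset

    # Shape of the data_files mapping that s3_load_dataset builds internally;
    # the file names below are illustrative only.
    data_files = {
        "train": ["./datasets/jan-hq/test-dataset/train-00000.parquet"],
        "test": ["./datasets/jan-hq/test-dataset/test-00000.parquet"],
    }
    ds = load_dataset("parquet", data_files=data_files, split="train")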
From a8e99246ab383871dad998640d9de9f461a4af41 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 17:32:21 +0700
Subject: [PATCH 03/13] add github CI-CD and Minio host script

---
 .github/workflows/test-flow.yml | 27 +++++++++++++++++++++++++++
 Minio_host.sh                   |  9 +++++++++
 test_flows.py                   |  3 +++
 3 files changed, 39 insertions(+)
 create mode 100644 Minio_host.sh

diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index e69de29..c39ceef 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -0,0 +1,27 @@
+name: CI/CD Test
+
+on:
+  push:
+    branches: main
+  pull_request:
+    branches: main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .
+
+    - name: Run test_flow.py
+      run: python test_flows.py
\ No newline at end of file
diff --git a/Minio_host.sh b/Minio_host.sh
new file mode 100644
index 0000000..214f378
--- /dev/null
+++ b/Minio_host.sh
@@ -0,0 +1,9 @@
+# mkdir -p ~/minio/data
+
+docker run \
+   -p 9000:9000 \
+   -p 9001:9001 \
+   --name minio \
+   -e "MINIO_ROOT_USER=minioadmin" \
+   -e "MINIO_ROOT_PASSWORD=minioadmin" \
+   quay.io/minio/minio server /data --console-address ":9001"
\ No newline at end of file
diff --git a/test_flows.py b/test_flows.py
index 2031fc7..dcb987b 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -1,5 +1,6 @@
 from s3helper import S3Helper,S3HelperAutoConfig,S3HelperAutoTokenizer,S3HelperAutoModelForCausalLM, s3_load_dataset
 import os
+import logging
 
 os.environ['S3_ACCESS_KEY'] = 'minioadmin'
 os.environ['S3_SECRET_KEY'] = 'minioadmin'
@@ -10,8 +11,10 @@
 model_name = "jan-hq/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
 tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
+logging.info(f"Tokenizer Loading successful: {tokenizer}")
 # print(tokenizer)
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
 dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')
+logging.info(f"Dataset Loading successful")
\ No newline at end of file
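Patch 03 makes the flow reproducible end to end: Minio_host.sh starts a local MinIO container, and the workflow runs test_flows.py on every push and pull request. The remaining manual step is seeding the bucket the test expects; a sketch of one way to do that with boto3 (the local parquet path is a placeholder, and the container from Minio_host.sh is assumed to be running):

    import boto3

    # Connect to the MinIO container started by Minio_host.sh.
    s3 = boto3.client(
        "s3",
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin",
        endpoint_url="http://127.0.0.1:9000",
    )
    s3.create_bucket(Bucket="jan-hq")
    # Key layout follows the bucket_name/object_name convention used by the library.
    s3.upload_file("train-00000.parquet", "jan-hq", "test-dataset/train-00000.parquet")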
"awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install + +ENV HOME=/home/runner + +RUN mkdir -p /home/runner + +ARG RUNNER_VERSION=2.317.0 + +ARG RUNNER_UID=1000 +ARG DOCKER_GID=1001 + +RUN adduser --disabled-password --gecos "" --uid $RUNNER_UID runner \ + && groupadd docker --gid $DOCKER_GID \ + && usermod -aG sudo runner \ + && usermod -aG docker runner \ + && echo "%sudo ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers \ + && echo "Defaults env_keep += \"DEBIAN_FRONTEND\"" >> /etc/sudoers + +# cd into the user directory, download and unzip the github actions runner +RUN cd /home/runner && mkdir actions-runner && cd actions-runner \ + && curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \ + && tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz + +RUN chown -R runner:runner /home/runner && /home/runner/actions-runner/bin/installdependencies.sh + +ADD ./start.sh /home/runner/start.sh + +RUN chmod +x /home/runner/start.sh + +# Add /usr/local/cuda-11.7/compat to LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + +ENTRYPOINT ["/bin/bash", "/home/runner/start.sh"] + +USER runner \ No newline at end of file diff --git a/.github/runners/start.sh b/.github/runners/start.sh new file mode 100644 index 0000000..84d3c3d --- /dev/null +++ b/.github/runners/start.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +RUNNER_REPO=$RUNNER_REPO +RUNNER_PAT=$RUNNER_PAT +RUNNER_GROUP=$RUNNER_GROUP +RUNNER_LABELS=$RUNNER_LABELS +RUNNER_NAME=$(hostname) + +cd /home/runner/actions-runner + +./config.sh --unattended --replace --url https://github.com/${RUNNER_REPO} --pat ${RUNNER_PAT} --name ${RUNNER_NAME} --runnergroup ${RUNNER_GROUP} --labels ${RUNNER_LABELS} --work /home/runner/actions-runner/_work + +cleanup() { + echo "Removing runner..." + ./config.sh remove --unattended --pat ${RUNNER_PAT} +} + +trap 'cleanup; exit 130' INT +trap 'cleanup; exit 143' TERM + +./run.sh & wait $! \ No newline at end of file diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml index c39ceef..dbc3cbb 100644 --- a/.github/workflows/test-flow.yml +++ b/.github/workflows/test-flow.yml @@ -8,15 +8,16 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: - ubuntu-latest + - research steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v2 with: - python-version: '3.10' + python-version: '3.11' - name: Install dependencies run: | From 69592477020b27642cbca62e7862a9b54e94395b Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Tue, 16 Jul 2024 20:51:56 +0700 Subject: [PATCH 05/13] Revert "update self-hosted runner" This reverts commit 3ed486ce7a6d7d04131aadd3e7182eae11270258. 
---
 .github/runners/Dockerfile      | 52 ---------------------------------
 .github/runners/start.sh        | 21 -------------
 .github/workflows/test-flow.yml |  7 ++---
 3 files changed, 3 insertions(+), 77 deletions(-)
 delete mode 100644 .github/runners/Dockerfile
 delete mode 100644 .github/runners/start.sh

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
deleted file mode 100644
index 6f5827d..0000000
--- a/.github/runners/Dockerfile
+++ /dev/null
@@ -1,52 +0,0 @@
-FROM docker.io/pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
-
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-    build-essential \
-    cmake \
-    sudo \
-    unzip \
-    curl \
-    wget \
-    git \
-    git-lfs \
-    jq \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
-    unzip awscliv2.zip && \
-    ./aws/install
-
-ENV HOME=/home/runner
-
-RUN mkdir -p /home/runner
-
-ARG RUNNER_VERSION=2.317.0
-
-ARG RUNNER_UID=1000
-ARG DOCKER_GID=1001
-
-RUN adduser --disabled-password --gecos "" --uid $RUNNER_UID runner \
-    && groupadd docker --gid $DOCKER_GID \
-    && usermod -aG sudo runner \
-    && usermod -aG docker runner \
-    && echo "%sudo ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers \
-    && echo "Defaults env_keep += \"DEBIAN_FRONTEND\"" >> /etc/sudoers
-
-# cd into the user directory, download and unzip the github actions runner
-RUN cd /home/runner && mkdir actions-runner && cd actions-runner \
-    && curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \
-    && tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz
-
-RUN chown -R runner:runner /home/runner && /home/runner/actions-runner/bin/installdependencies.sh
-
-ADD ./start.sh /home/runner/start.sh
-
-RUN chmod +x /home/runner/start.sh
-
-# Add /usr/local/cuda-11.7/compat to LD_LIBRARY_PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-
-ENTRYPOINT ["/bin/bash", "/home/runner/start.sh"]
-
-USER runner
\ No newline at end of file
diff --git a/.github/runners/start.sh b/.github/runners/start.sh
deleted file mode 100644
index 84d3c3d..0000000
--- a/.github/runners/start.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-RUNNER_REPO=$RUNNER_REPO
-RUNNER_PAT=$RUNNER_PAT
-RUNNER_GROUP=$RUNNER_GROUP
-RUNNER_LABELS=$RUNNER_LABELS
-RUNNER_NAME=$(hostname)
-
-cd /home/runner/actions-runner
-
-./config.sh --unattended --replace --url https://github.com/${RUNNER_REPO} --pat ${RUNNER_PAT} --name ${RUNNER_NAME} --runnergroup ${RUNNER_GROUP} --labels ${RUNNER_LABELS} --work /home/runner/actions-runner/_work
-
-cleanup() {
-    echo "Removing runner..."
-    ./config.sh remove --unattended --pat ${RUNNER_PAT}
-}
-
-trap 'cleanup; exit 130' INT
-trap 'cleanup; exit 143' TERM
-
-./run.sh & wait $!
\ No newline at end of file
diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index dbc3cbb..c39ceef 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -8,16 +8,15 @@ on:
 
 jobs:
   test:
-    runs-on: - ubuntu-latest
-      - research
+    runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v4
 
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
      with:
-        python-version: '3.11'
+        python-version: '3.10'
 
     - name: Install dependencies
       run: |
From 52acb7cb499d7dec95e6456249678f3b0f8dc3d0 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 21:00:30 +0700
Subject: [PATCH 06/13] add self-hosted runner

---
 .github/runners/Dockerfile      | 52 +++++++++++++++++++++++++++++++++
 .github/runners/start.sh        | 21 +++++++++++++
 .github/workflows/test-flow.yml |  3 +-
 3 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 .github/runners/Dockerfile
 create mode 100644 .github/runners/start.sh

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
new file mode 100644
index 0000000..6f5827d
--- /dev/null
+++ b/.github/runners/Dockerfile
@@ -0,0 +1,52 @@
+FROM docker.io/pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    sudo \
+    unzip \
+    curl \
+    wget \
+    git \
+    git-lfs \
+    jq \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
+    unzip awscliv2.zip && \
+    ./aws/install
+
+ENV HOME=/home/runner
+
+RUN mkdir -p /home/runner
+
+ARG RUNNER_VERSION=2.317.0
+
+ARG RUNNER_UID=1000
+ARG DOCKER_GID=1001
+
+RUN adduser --disabled-password --gecos "" --uid $RUNNER_UID runner \
+    && groupadd docker --gid $DOCKER_GID \
+    && usermod -aG sudo runner \
+    && usermod -aG docker runner \
+    && echo "%sudo ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers \
+    && echo "Defaults env_keep += \"DEBIAN_FRONTEND\"" >> /etc/sudoers
+
+# cd into the user directory, download and unzip the github actions runner
+RUN cd /home/runner && mkdir actions-runner && cd actions-runner \
+    && curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \
+    && tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz
+
+RUN chown -R runner:runner /home/runner && /home/runner/actions-runner/bin/installdependencies.sh
+
+ADD ./start.sh /home/runner/start.sh
+
+RUN chmod +x /home/runner/start.sh
+
+# Add /usr/local/cuda-11.7/compat to LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+ENTRYPOINT ["/bin/bash", "/home/runner/start.sh"]
+
+USER runner
\ No newline at end of file
diff --git a/.github/runners/start.sh b/.github/runners/start.sh
new file mode 100644
index 0000000..84d3c3d
--- /dev/null
+++ b/.github/runners/start.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+RUNNER_REPO=$RUNNER_REPO
+RUNNER_PAT=$RUNNER_PAT
+RUNNER_GROUP=$RUNNER_GROUP
+RUNNER_LABELS=$RUNNER_LABELS
+RUNNER_NAME=$(hostname)
+
+cd /home/runner/actions-runner
+
+./config.sh --unattended --replace --url https://github.com/${RUNNER_REPO} --pat ${RUNNER_PAT} --name ${RUNNER_NAME} --runnergroup ${RUNNER_GROUP} --labels ${RUNNER_LABELS} --work /home/runner/actions-runner/_work
+
+cleanup() {
+    echo "Removing runner..."
+    ./config.sh remove --unattended --pat ${RUNNER_PAT}
+}
+
+trap 'cleanup; exit 130' INT
+trap 'cleanup; exit 143' TERM
+
+./run.sh & wait $!
\ No newline at end of file
diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index c39ceef..ae463b3 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -8,7 +8,8 @@ on:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on: - ubuntu-latest
+      - research
 
     steps:
     - uses: actions/checkout@v4
From 18a7dbaeea3591a43aac0916236df3311388c713 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 21:04:21 +0700
Subject: [PATCH 07/13] debug

---
 .github/workflows/test-flow.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index ae463b3..838f5ad 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -8,8 +8,7 @@ on:
 
 jobs:
   test:
-    runs-on: - ubuntu-latest
-      - research
+    runs-on: research
 
     steps:
     - uses: actions/checkout@v4
From b2d4964d4374edc147b33b9401b3091eb7e04732 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 21:46:18 +0700
Subject: [PATCH 08/13] update dockerfile

---
 .github/runners/Dockerfile | 22 ++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
index 6f5827d..7fc0d93 100644
--- a/.github/runners/Dockerfile
+++ b/.github/runners/Dockerfile
@@ -1,17 +1,19 @@
-FROM docker.io/pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+FROM python:3.11-slim
 
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-    build-essential \
-    cmake \
-    sudo \
-    unzip \
+# Install necessary packages
+RUN apt-get update && apt-get install -y \
     curl \
-    wget \
+    tar \
+    unzip \
     git \
-    git-lfs \
     jq \
-    && rm -rf /var/lib/apt/lists/*
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install MinIO client
+RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc \
+    && chmod +x mc \
+    && mv mc /usr/local/bin/mc
 
 RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
     unzip awscliv2.zip && \
     ./aws/install
From f6c82c19050f380f8a5dabdbede06f280d3afbf5 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 22:01:30 +0700
Subject: [PATCH 09/13] update Dockerfile

---
 .github/runners/Dockerfile | 5 -----
 test_flows.py              | 6 +++---
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
index 7fc0d93..68e6bd7 100644
--- a/.github/runners/Dockerfile
+++ b/.github/runners/Dockerfile
@@ -10,11 +10,6 @@ RUN apt-get update && apt-get install -y \
     wget \
     && rm -rf /var/lib/apt/lists/*
 
-# Install MinIO client
-RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc \
-    && chmod +x mc \
-    && mv mc /usr/local/bin/mc
-
 RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
     unzip awscliv2.zip && \
     ./aws/install
diff --git a/test_flows.py b/test_flows.py
index dcb987b..00f506d 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -2,9 +2,9 @@
 import os
 import logging
 
-os.environ['S3_ACCESS_KEY'] = 'minioadmin'
-os.environ['S3_SECRET_KEY'] = 'minioadmin'
-os.environ['S3_ENDPOINT_URL'] = 'http://172.17.0.2:9000'
+# os.environ['S3_ACCESS_KEY'] = 'minioadmin'
+# os.environ['S3_SECRET_KEY'] = 'minioadmin'
+# os.environ['S3_ENDPOINT_URL'] = 'http://172.17.0.2:9000'
 
 S3Helper()
 # # Example usage
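From patch 09 onward the credentials are no longer hard-coded in the test script: S3_ACCESS_KEY, S3_SECRET_KEY, and S3_ENDPOINT_URL must now arrive via the environment, and patch 10 below wires them to repository secrets in CI. A small guard along these lines (a sketch, not code from the repo) makes the failure mode explicit when a variable is missing:

    import os

    REQUIRED = ("S3_ACCESS_KEY", "S3_SECRET_KEY", "S3_ENDPOINT_URL")
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing S3 configuration: {', '.join(missing)}")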
From 4cda605f44bc3edb8ce08384c6196c4be5db4c57 Mon Sep 17 00:00:00 2001
From: Hien To
Date: Wed, 17 Jul 2024 10:40:37 +0700
Subject: [PATCH 10/13] CICD for research-utils

---
 .github/runners/Dockerfile      | 3 ---
 .github/workflows/test-flow.yml | 9 +++++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
index 68e6bd7..4674aa3 100644
--- a/.github/runners/Dockerfile
+++ b/.github/runners/Dockerfile
@@ -41,9 +41,6 @@ ADD ./start.sh /home/runner/start.sh
 
 RUN chmod +x /home/runner/start.sh
 
-# Add /usr/local/cuda-11.7/compat to LD_LIBRARY_PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-
 ENTRYPOINT ["/bin/bash", "/home/runner/start.sh"]
 
 USER runner
\ No newline at end of file
diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index 838f5ad..f21531d 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -5,10 +5,11 @@ on:
     branches: main
   pull_request:
     branches: main
+  workflow_dispatch:
 
 jobs:
   test:
-    runs-on: research
+    runs-on: research-utils
 
     steps:
     - uses: actions/checkout@v4
@@ -24,4 +25,8 @@
         pip install -e .
 
     - name: Run test_flow.py
-      run: python test_flows.py
\ No newline at end of file
+      run: python test_flows.py
+      env:
+        S3_ACCESS_KEY: ${{ secrets.MINIO_ACCESS_KEY_ID }}
+        S3_SECRET_KEY: ${{ secrets.MINIO_SECRET_ACCESS_KEY }}
+        S3_ENDPOINT_URL: ${{ secrets.MINIO_ENDPOINT }}
\ No newline at end of file
From bff24d022c87e3cbfdd11b335646631e7c16c4b0 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Tue, 16 Jul 2024 22:12:01 +0700
Subject: [PATCH 11/13] update dockerfile

---
 .github/runners/Dockerfile | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
index 4674aa3..493489d 100644
--- a/.github/runners/Dockerfile
+++ b/.github/runners/Dockerfile
@@ -1,14 +1,19 @@
 FROM python:3.11-slim
 
 # Install necessary packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    tar \
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    sudo \
     unzip \
+    curl \
+    wget \
     git \
+    git-lfs \
     jq \
-    wget \
     && rm -rf /var/lib/apt/lists/*
+
 
 RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
     unzip awscliv2.zip && \
     ./aws/install
From d91717045d25dcabdbcf6270a1053f2df7c8b19c Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Wed, 17 Jul 2024 11:04:05 +0700
Subject: [PATCH 12/13] remove CI python setup

---
 .github/workflows/test-flow.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/test-flow.yml b/.github/workflows/test-flow.yml
index f21531d..f2ff92e 100644
--- a/.github/workflows/test-flow.yml
+++ b/.github/workflows/test-flow.yml
@@ -14,11 +14,6 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.10'
-
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
From 2d4f077958502f57ad5853921af652aacbab8cd5 Mon Sep 17 00:00:00 2001
From: bachvudinh
Date: Wed, 17 Jul 2024 11:08:57 +0700
Subject: [PATCH 13/13] configure a bucket for testing

---
 test_flows.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_flows.py b/test_flows.py
index 00f506d..aeea043 100644
--- a/test_flows.py
+++ b/test_flows.py
@@ -8,7 +8,7 @@
 S3Helper()
 
 # # Example usage
-model_name = "jan-hq/tokenizer-tinyllama"
+model_name = "jan-hq-test/tokenizer-tinyllama"
 # model = S3HelperAutoModelForCausalLM.from_pretrained(model_name)
 tokenizer = S3HelperAutoTokenizer.from_pretrained(model_name)
 logging.info(f"Tokenizer Loading successful: {tokenizer}")
@@ -16,5 +16,5 @@
 # config = S3HelperAutoConfig.from_pretrained(model_name)
 # Make sure S3Helper is initialized and environment variables are set
 # Load a dataset from S3 bucket
-dataset = s3_load_dataset("jan-hq/test-dataset",file_format='parquet', split='train')
+dataset = s3_load_dataset("jan-hq-test/test-dataset",file_format='parquet', split='train')
 logging.info(f"Dataset Loading successful")
\ No newline at end of file
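After patch 13 the series leaves the test flow exercising the whole library against a dedicated `jan-hq-test` bucket. Put together, the final state of test_flows.py is equivalent to the following (a condensed sketch using only names that appear in the patches; the endpoint and credentials are expected in the environment):

    import logging
    from s3helper import S3Helper, S3HelperAutoTokenizer, s3_load_dataset

    # S3_ACCESS_KEY, S3_SECRET_KEY, and S3_ENDPOINT_URL must be set before this runs.
    S3Helper()

    tokenizer = S3HelperAutoTokenizer.from_pretrained("jan-hq-test/tokenizer-tinyllama")
    logging.info(f"Tokenizer Loading successful: {tokenizer}")

    dataset = s3_load_dataset("jan-hq-test/test-dataset", file_format='parquet', split='train')
    logging.info("Dataset Loading successful")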