From a8226880f7309e2d7bc168ebd8ca282970ca0125 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Thu, 29 Aug 2024 15:06:04 +0100
Subject: [PATCH] Cherry pick of #2180 #2230 into v1.8-branch (#2242)

* Update `huggingface_hub` Version in the storage initializer to fix ImportError (#2180)

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* [SDK] Fix trainer error: Update the version of base image and add "num_labels" for downloading pretrained models (#2230)

* fix trainer error

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* rerun tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* update the process of num_labels in trainer

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* rerun tests

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

* adjust the  default value of 'num_labels'

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>

---------

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

---------

Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Co-authored-by: Hezhi Xie <hezxie@ucdavis.edu>
Co-authored-by: Hezhi (Helen) Xie <helenxiehz@gmail.com>
---
 .../storage_initializer/hugging_face.py       |  1 +
 .../storage_initializer/requirements.txt      |  2 +-
 sdk/python/kubeflow/trainer/Dockerfile        |  2 +-
 .../kubeflow/trainer/hf_llm_training.py       | 26 +++++++++++++------
 .../kubeflow/training/api/training_client.py  |  2 ++
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py
index 4b5b0794a9..33be724cf0 100644
--- a/sdk/python/kubeflow/storage_initializer/hugging_face.py
+++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -38,6 +38,7 @@ class HuggingFaceModelParams:
     model_uri: str
     transformer_type: TRANSFORMER_TYPES
     access_token: str = None
+    num_labels: Optional[int] = None
 
     def __post_init__(self):
         # Custom checks or validations can be added here
diff --git a/sdk/python/kubeflow/storage_initializer/requirements.txt b/sdk/python/kubeflow/storage_initializer/requirements.txt
index d795aa99f5..24ad9f7675 100644
--- a/sdk/python/kubeflow/storage_initializer/requirements.txt
+++ b/sdk/python/kubeflow/storage_initializer/requirements.txt
@@ -2,4 +2,4 @@ peft==0.3.0
 datasets==2.15.0
 transformers==4.38.0
 boto3==1.33.9
-huggingface_hub==0.19.3
+huggingface_hub==0.23.4
diff --git a/sdk/python/kubeflow/trainer/Dockerfile b/sdk/python/kubeflow/trainer/Dockerfile
index d0ebee4aa3..6b98e3de31 100644
--- a/sdk/python/kubeflow/trainer/Dockerfile
+++ b/sdk/python/kubeflow/trainer/Dockerfile
@@ -1,5 +1,5 @@
 # Use an official Pytorch runtime as a parent image
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/pytorch:24.06-py3
 
 # Set the working directory in the container
 WORKDIR /app
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 26dd4fbe0e..5b3a4360fb 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -29,17 +29,26 @@
 logger.setLevel(logging.INFO)
 
 
-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels):
     # Set up the model and tokenizer
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
 
-    model = transformer_type.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        cache_dir=model_dir,
-        local_files_only=True,
-        trust_remote_code=True,
-    )
+    if num_labels != "None":
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+            num_labels=int(num_labels),
+        )
+    else:
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=model_name,
@@ -145,6 +154,7 @@ def parse_arguments():
 
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
+    parser.add_argument("--num_labels", default="None", help="number of classes")
     parser.add_argument("--model_dir", help="directory containing model")
     parser.add_argument("--dataset_dir", help="directory containing dataset")
     parser.add_argument("--lora_config", help="lora_config")
@@ -163,7 +173,7 @@ def parse_arguments():
 
     logger.info("Setup model and tokenizer")
     model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, transformer_type, args.model_dir
+        args.model_uri, transformer_type, args.model_dir, args.num_labels
     )
 
     logger.info("Preprocess dataset")
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index edac130194..4165904b21 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -264,6 +264,8 @@ def train(
                 model_provider_parameters.model_uri,
                 "--transformer_type",
                 model_provider_parameters.transformer_type.__name__,
+                "--num_labels",
+                str(model_provider_parameters.num_labels),
                 "--model_dir",
                 VOLUME_PATH_MODEL,
                 "--dataset_dir",