From 5b0f796685d12107e2349ab75e2c8e514427ebf1 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Tue, 20 Aug 2024 22:18:14 +0800
Subject: [PATCH 1/5] fix trainer error

Signed-off-by: helenxie-bit
---
 .../storage_initializer/hugging_face.py       |  1 +
 sdk/python/kubeflow/trainer/Dockerfile        |  2 +-
 .../kubeflow/trainer/hf_llm_training.py       | 35 +++++++++++++------
 .../kubeflow/training/api/training_client.py  |  2 ++
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py
index 0fbf511350..5c31feef6f 100644
--- a/sdk/python/kubeflow/storage_initializer/hugging_face.py
+++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -39,6 +39,7 @@ class HuggingFaceModelParams:
     model_uri: str
     transformer_type: TRANSFORMER_TYPES
     access_token: str = None
+    num_labels: Optional[int] = None
 
     def __post_init__(self):
         # Custom checks or validations can be added here
diff --git a/sdk/python/kubeflow/trainer/Dockerfile b/sdk/python/kubeflow/trainer/Dockerfile
index d0ebee4aa3..6b98e3de31 100644
--- a/sdk/python/kubeflow/trainer/Dockerfile
+++ b/sdk/python/kubeflow/trainer/Dockerfile
@@ -1,5 +1,5 @@
 # Use an official Pytorch runtime as a parent image
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/pytorch:24.06-py3
 
 # Set the working directory in the container
 WORKDIR /app
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 4ca2ac1e0f..6871a5cec1 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -28,17 +28,26 @@
 logger.setLevel(logging.INFO)
 
 
-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels):
     # Set up the model and tokenizer
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
 
-    model = transformer_type.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        cache_dir=model_dir,
-        local_files_only=True,
-        trust_remote_code=True,
-    )
+    if num_labels > 0:
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+            num_labels=num_labels,
+        )
+    else:
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=model_name,
@@ -151,6 +160,7 @@ def parse_arguments():
 
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
+    parser.add_argument("--num_labels", help="number of classes")
     parser.add_argument("--model_dir", help="directory containing model")
     parser.add_argument("--dataset_dir", help="directory containing dataset")
     parser.add_argument("--lora_config", help="lora_config")
@@ -177,9 +187,14 @@ def parse_arguments():
     transformer_type = getattr(transformers, args.transformer_type)
 
     logger.info("Setup model and tokenizer")
-    model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, transformer_type, args.model_dir
-    )
+    if args.num_labels != "None":
+        model, tokenizer = setup_model_and_tokenizer(
+            args.model_uri, transformer_type, args.model_dir, int(args.num_labels)
+        )
+    else:
+        model, tokenizer = setup_model_and_tokenizer(
+            args.model_uri, transformer_type, args.model_dir, 0
+        )
 
     logger.info("Preprocess dataset")
     train_data, eval_data = load_and_preprocess_data(
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index 868908f009..dd45da06fa 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -265,6 +265,8 @@ def train(
                         model_provider_parameters.model_uri,
                         "--transformer_type",
                         model_provider_parameters.transformer_type.__name__,
+                        "--num_labels",
+                        str(model_provider_parameters.num_labels),
                         "--model_dir",
                         VOLUME_PATH_MODEL,
                         "--dataset_dir",
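Note on [PATCH 1/5]: the trainer compares args.num_labels against the string "None" because the SDK serializes every trainer argument with str(), so a Python None reaches the container as the literal string "None" and a real value arrives as its decimal string. A minimal, self-contained sketch of that round trip (plain argparse, no Kubeflow pieces involved):

    import argparse

    # Mirror of the trainer's flag: argparse always delivers string values.
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_labels", help="number of classes")

    # The SDK side appends: "--num_labels", str(model_provider_parameters.num_labels)
    args = parser.parse_args(["--num_labels", str(None)])
    print(repr(args.num_labels))  # 'None' -- the string, not the None object

    args = parser.parse_args(["--num_labels", str(2)])
    print(int(args.num_labels))   # 2 -- hence the int() cast in the trainer
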
From c0406d43b407ac86ec134eae6a3d19bba55ad1df Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Wed, 21 Aug 2024 11:06:46 +0800
Subject: [PATCH 2/5] rerun tests

Signed-off-by: helenxie-bit
---
 sdk/python/kubeflow/trainer/hf_llm_training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 6871a5cec1..163425e69c 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -187,13 +187,13 @@ def parse_arguments():
     transformer_type = getattr(transformers, args.transformer_type)
 
     logger.info("Setup model and tokenizer")
-    if args.num_labels != "None":
+    if args.num_labels == "None":
         model, tokenizer = setup_model_and_tokenizer(
-            args.model_uri, transformer_type, args.model_dir, int(args.num_labels)
+            args.model_uri, transformer_type, args.model_dir, 0
         )
     else:
         model, tokenizer = setup_model_and_tokenizer(
-            args.model_uri, transformer_type, args.model_dir, 0
+            args.model_uri, transformer_type, args.model_dir, int(args.num_labels)
        )
 
     logger.info("Preprocess dataset")

From b9dd5923088ef41af5e23e968750203a520702dc Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Thu, 22 Aug 2024 03:11:21 +0800
Subject: [PATCH 3/5] update the process of num_labels in trainer

Signed-off-by: helenxie-bit
---
 sdk/python/kubeflow/trainer/hf_llm_training.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 163425e69c..adbda5ee89 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -33,13 +33,13 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
 
-    if num_labels > 0:
+    if num_labels != "None" and num_labels is not None:
         model = transformer_type.from_pretrained(
             pretrained_model_name_or_path=model_name,
             cache_dir=model_dir,
             local_files_only=True,
             trust_remote_code=True,
-            num_labels=num_labels,
+            num_labels=int(num_labels),
         )
     else:
         model = transformer_type.from_pretrained(
@@ -160,7 +160,7 @@ def parse_arguments():
 
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
-    parser.add_argument("--num_labels", help="number of classes")
+    parser.add_argument("--num_labels", default=None, help="number of classes")
     parser.add_argument("--model_dir", help="directory containing model")
     parser.add_argument("--dataset_dir", help="directory containing dataset")
     parser.add_argument("--lora_config", help="lora_config")
@@ -187,14 +187,9 @@ def parse_arguments():
     transformer_type = getattr(transformers, args.transformer_type)
 
     logger.info("Setup model and tokenizer")
-    if args.num_labels == "None":
-        model, tokenizer = setup_model_and_tokenizer(
-            args.model_uri, transformer_type, args.model_dir, 0
-        )
-    else:
-        model, tokenizer = setup_model_and_tokenizer(
-            args.model_uri, transformer_type, args.model_dir, int(args.num_labels)
-        )
+    model, tokenizer = setup_model_and_tokenizer(
+        args.model_uri, transformer_type, args.model_dir, args.num_labels
+    )
 
     logger.info("Preprocess dataset")
     train_data, eval_data = load_and_preprocess_data(
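For reference, what the forwarded kwarg controls in transformers: num_labels is stored on the model config and sizes the classification head (and the id2label map) that classes such as AutoModelForSequenceClassification attach on top of the base model. A short sketch using AutoConfig (assumes network access to the Hugging Face Hub; bert-base-cased is only an example checkpoint):

    from transformers import AutoConfig

    # num_labels lands on the config and drives the classification head size.
    config = AutoConfig.from_pretrained("bert-base-cased", num_labels=4)
    print(config.num_labels)  # 4
    print(config.id2label)    # {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3'}

    # Omitting the kwarg keeps the checkpoint's default (2 here, since this
    # checkpoint ships no task labels) -- which is why the trainer forwards
    # num_labels only when the caller actually set one.
    print(AutoConfig.from_pretrained("bert-base-cased").num_labels)
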
tokenizer") - if args.num_labels == "None": - model, tokenizer = setup_model_and_tokenizer( - args.model_uri, transformer_type, args.model_dir, 0 - ) - else: - model, tokenizer = setup_model_and_tokenizer( - args.model_uri, transformer_type, args.model_dir, int(args.num_labels) - ) + model, tokenizer = setup_model_and_tokenizer( + args.model_uri, transformer_type, args.model_dir, args.num_labels + ) logger.info("Preprocess dataset") train_data, eval_data = load_and_preprocess_data( From 56f112b9655ab14d4d272713f32c59a863cd6a03 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 22 Aug 2024 15:52:20 +0800 Subject: [PATCH 4/5] rerun tests Signed-off-by: helenxie-bit --- sdk/python/kubeflow/trainer/hf_llm_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index adbda5ee89..b516b4a4d1 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -33,7 +33,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels parsed_uri = urlparse(model_uri) model_name = parsed_uri.netloc + parsed_uri.path - if num_labels != "None" and num_labels is not None: + if num_labels is not None and num_labels != "None": model = transformer_type.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=model_dir, From f7ef5206616dd3c67fcbb2279b865b7e6c28a3cd Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 28 Aug 2024 06:33:27 +0800 Subject: [PATCH 5/5] adjust the default value of 'num_labels' Signed-off-by: helenxie-bit --- sdk/python/kubeflow/trainer/hf_llm_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index b516b4a4d1..634800164e 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -33,7 +33,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels parsed_uri = urlparse(model_uri) model_name = parsed_uri.netloc + parsed_uri.path - if num_labels is not None and num_labels != "None": + if num_labels != "None": model = transformer_type.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=model_dir, @@ -160,7 +160,7 @@ def parse_arguments(): parser.add_argument("--model_uri", help="model uri") parser.add_argument("--transformer_type", help="model transformer type") - parser.add_argument("--num_labels", default=None, help="number of classes") + parser.add_argument("--num_labels", default="None", help="number of classes") parser.add_argument("--model_dir", help="directory containing model") parser.add_argument("--dataset_dir", help="directory containing dataset") parser.add_argument("--lora_config", help="lora_config")