diff --git a/examples/sdk/train_api.ipynb b/examples/sdk/train_api.ipynb
new file mode 100644
index 0000000000..76c74a0354
--- /dev/null
+++ b/examples/sdk/train_api.ipynb
@@ -0,0 +1,133 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# install the kubeflow-training SDK with the 'huggingface' extra\n",
+    "!pip install -U 'kubeflow-training[huggingface]'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the required libraries\n",
+    "from kubeflow.training.api.training_client import TrainingClient\n",
+    "from kubeflow.storage_initializer.hugging_face import (\n",
+    "    HuggingFaceModelParams,\n",
+    "    HuggingFaceTrainParams,\n",
+    "    HfDatasetParams,\n",
+    ")\n",
+    "from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n",
+    "from peft import LoraConfig\n",
+    "import transformers\n",
+    "from transformers import TrainingArguments\n",
+    "from kubeflow.training import constants"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a training client; pass the config_file parameter if you want to use a kubeconfig other than \"~/.kube/config\"\n",
+    "client = TrainingClient()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# specify the model, dataset, and training parameters\n",
+    "client.train(\n",
+    "    name=\"huggingface-test\",\n",
+    "    num_workers=2,\n",
+    "    num_procs_per_worker=1,\n",
+    "    # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
+    "    # storage_config={\n",
+    "    #     \"size\": \"10Gi\",\n",
+    "    #     \"storage_class\": \"\",\n",
+    "    # },\n",
+    "    model_provider_parameters=HuggingFaceModelParams(\n",
+    "        model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
+    "        transformer_type=transformers.AutoModelForCausalLM,\n",
+    "    ),\n",
+    "    # for text-related tasks, the dataset is assumed to have a 'text' column.\n",
+    "    # for more info on how the dataset is loaded, check the load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n",
+    "    dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n",
+    "    train_parameters=HuggingFaceTrainParams(\n",
+    "        lora_config=LoraConfig(\n",
+    "            r=8,\n",
+    "            lora_alpha=8,\n",
+    "            lora_dropout=0.1,\n",
+    "            bias=\"none\",\n",
+    "            task_type=\"CAUSAL_LM\",\n",
+    "        ),\n",
+    "        training_parameters=TrainingArguments(\n",
+    "            num_train_epochs=1,\n",
+    "            per_device_train_batch_size=4,\n",
+    "            gradient_accumulation_steps=4,\n",
+    "            gradient_checkpointing=True,\n",
+    "            warmup_steps=0.02,\n",
+    "            learning_rate=1,\n",
+    "            lr_scheduler_type=\"cosine\",\n",
+    "            bf16=False,\n",
+    "            logging_steps=0.01,\n",
+    "            output_dir=INIT_CONTAINER_MOUNT_PATH,\n",
+    "            optim=\"sgd\",\n",
+    "            save_steps=0.01,\n",
+    "            save_total_limit=3,\n",
+    "            disable_tqdm=False,\n",
+    "            resume_from_checkpoint=True,\n",
+    "            remove_unused_columns=True,\n",
+    "        ),\n",
+    "    ),\n",
+    "    resources_per_worker={\n",
+    "        \"gpu\": 1,\n",
+    "        \"cpu\": 8,\n",
+    "        \"memory\": \"16Gi\",\n",
+    "    },  # remove the gpu key if you don't want to attach GPUs to the pods\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check the logs of the job\n",
+    "client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
"display_name": "myenv3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py deleted file mode 100644 index e71c1c7fd0..0000000000 --- a/examples/sdk/train_api.py +++ /dev/null @@ -1,57 +0,0 @@ -from kubeflow.training.api.training_client import TrainingClient -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceTrainParams, - HfDatasetParams, -) -from peft import LoraConfig -import transformers -from transformers import TrainingArguments - -client = TrainingClient() - -client.train( - name="hf-test", - num_workers=2, - num_procs_per_worker=0, - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://Jedalc/codeparrot-gp2-finetune", - transformer_type=transformers.AutoModelForCausalLM, - ), - dataset_provider_parameters=HfDatasetParams(repo_id="imdatta0/ultrachat_10k"), - train_parameters=HuggingFaceTrainParams( - lora_config=LoraConfig( - r=8, - lora_alpha=8, - target_modules=["c_attn", "c_proj", "w1", "w2"], - layers_to_transform=list(range(30, 40)), - # layers_pattern=['lm_head'], - lora_dropout=0.1, - bias="none", - task_type="CAUSAL_LM", - ), - training_parameters=TrainingArguments( - num_train_epochs=2, - per_device_train_batch_size=1, - gradient_accumulation_steps=1, - gradient_checkpointing=True, - warmup_steps=0.01, - # max_steps=50, #20, - learning_rate=1, - lr_scheduler_type="cosine", - bf16=False, - logging_steps=0.01, - output_dir="", - optim=f"paged_adamw_32bit", - save_steps=0.01, - save_total_limit=3, - disable_tqdm=False, - resume_from_checkpoint=True, - remove_unused_columns=True, - evaluation_strategy="steps", - eval_steps=0.01, - per_device_eval_batch_size=1, - ), - ), - resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"}, -) diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index f0ddd8f9c2..d82b715552 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -1,18 +1,18 @@ # Use an official Pytorch runtime as a parent image -FROM nvcr.io/nvidia/pytorch:23.12-py3 +FROM nvcr.io/nvidia/pytorch:23.10-py3 - # Set the working directory in the container - WORKDIR /app +# Set the working directory in the container +WORKDIR /app - # Copy the Python package and its source code into the container - COPY . /app +# Copy the Python package and its source code into the container +COPY . 
 
- # Copy the requirements.txt file into the container
+# Copy the requirements.txt file into the container
 COPY requirements.txt /app/requirements.txt
 
- # Install any needed packages specified in requirements.txt
- RUN pip install --no-cache-dir -r requirements.txt
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
- # Run storage.py when the container launches
- ENTRYPOINT ["torchrun", "hf_llm_training.py"]
+# Run hf_llm_training.py with torchrun when the container launches
+ENTRYPOINT ["torchrun", "hf_llm_training.py"]
\ No newline at end of file
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 2bccc0ac4a..26c48c08dd 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -16,9 +16,8 @@
 import json
 
 
-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, train_args):
     # Set up the model and tokenizer
-
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
     transformer_type_class = getattr(transformers, transformer_type)
@@ -28,6 +27,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
         cache_dir=model_dir,
         local_files_only=True,
         device_map="auto",
+        trust_remote_code=True,
     )
 
     tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -47,16 +47,24 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     return model, tokenizer
 
 
-def load_and_preprocess_data(dataset_name, dataset_dir):
+def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer):
     # Load and preprocess the dataset
     print("loading dataset")
-    dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
+    transformer_type_class = getattr(transformers, transformer_type)
+    if transformer_type_class != transformers.AutoModelForImageClassification:
+        dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map(
+            lambda x: tokenizer(x["text"]), batched=True
+        )
+    else:
+        dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
+
     train_data = dataset["train"]
 
     try:
         eval_data = dataset["eval"]
     except Exception as err:
         eval_data = None
+        print("Evaluation dataset is not found")
 
     return train_data, eval_data
 
@@ -64,26 +72,23 @@ def load_and_preprocess_data(dataset_name, dataset_dir):
 def setup_peft_model(model, lora_config):
     # Set up the PEFT model
     lora_config = LoraConfig(**json.loads(lora_config))
-    print(lora_config)
+    model.enable_input_require_grads()
     model = get_peft_model(model, lora_config)
     return model
 
 
-def train_model(model, train_data, eval_data, tokenizer, train_params):
+def train_model(model, train_data, eval_data, tokenizer, train_args):
     # Train the model
     trainer = Trainer(
         model=model,
         train_dataset=train_data,
         eval_dataset=eval_data,
         tokenizer=tokenizer,
-        args=TrainingArguments(
-            **train_params,
-            data_collator=DataCollatorForLanguageModeling(
-                tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False
-            )
+        args=train_args,
+        data_collator=DataCollatorForLanguageModeling(
+            tokenizer, pad_to_multiple_of=8, mlm=False
         ),
     )
-
     trainer.train()
     print("training done")
 
@@ -108,11 +113,12 @@ def parse_arguments():
 
 if __name__ == "__main__":
     args = parse_arguments()
+    train_args = TrainingArguments(**json.loads(args.training_parameters))
     model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, args.transformer_type, args.model_dir
+        args.model_uri, args.transformer_type, args.model_dir, train_args
     )
     train_data, eval_data = load_and_preprocess_data(
-        args.dataset_name, args.dataset_dir
+        args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer
     )
     model = setup_peft_model(model, args.lora_config)
-    train_model(model, train_data, eval_data, tokenizer, args.training_parameters)
+    train_model(model, train_data, eval_data, tokenizer, train_args)
diff --git a/sdk/python/kubeflow/trainer/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt
index e4c4b2b6c3..f342311be0 100644
--- a/sdk/python/kubeflow/trainer/requirements.txt
+++ b/sdk/python/kubeflow/trainer/requirements.txt
@@ -1,3 +1,5 @@
-peft==0.7.0
+peft>=0.3.0
 datasets==2.15.0
-transformers==4.35.2
\ No newline at end of file
+transformers>=4.20.0
+bitsandbytes>=0.42.0
+einops>=0.6.1
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index 37c782c5c5..a8187de7e0 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -171,8 +171,7 @@ def train(
                 ),
             )
         except Exception as e:
-            pass  # local
-            # raise RuntimeError("failed to create pvc")
+            raise RuntimeError("failed to create pvc")
 
         if isinstance(model_provider_parameters, HuggingFaceModelParams):
             mp = "hf"
diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py
index 09130a4de1..655839225b 100644
--- a/sdk/python/kubeflow/training/utils/utils.py
+++ b/sdk/python/kubeflow/training/utils/utils.py
@@ -131,7 +131,6 @@ def get_container_spec(
         raise ValueError("container name or image cannot be none")
 
     container_spec = models.V1Container(name=name, image=image)
-    container_spec.image_pull_policy = "Always"
 
     if args:
         container_spec.args = args
@@ -175,8 +174,7 @@ def get_pod_template_spec(
                     name=constants.JOB_PARAMETERS[job_kind]["container"],
                     image=base_image,
                 )
-            ],
-            image_pull_secrets=[models.V1LocalObjectReference(name="regcred")],
+            ]
         ),
     )
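
Note (not part of the diff above): the new notebook submits a PyTorchJob named "huggingface-test" via client.train() and then tails its logs. A minimal follow-up sketch for checking whether the job finished and cleaning it up, assuming the same TrainingClient API the notebook already uses and that is_job_succeeded, get_job_conditions, and delete_job are available in this SDK version:

from kubeflow.training import constants
from kubeflow.training.api.training_client import TrainingClient

client = TrainingClient()
job_name = "huggingface-test"  # the PyTorchJob created by client.train() in the notebook above

# Check whether the training operator has marked the job as succeeded.
if client.is_job_succeeded(name=job_name, job_kind=constants.PYTORCHJOB_KIND):
    # Remove the PyTorchJob (and its worker pods) once fine-tuning is done.
    client.delete_job(name=job_name, job_kind=constants.PYTORCHJOB_KIND)
else:
    # Otherwise, inspect the current job conditions (Created, Running, Failed, ...).
    print(client.get_job_conditions(name=job_name, job_kind=constants.PYTORCHJOB_KIND))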