forked from kubeflow/training-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
downgrading pytorch version, removing changes for running things locally, adding jupyter notebook
- Loading branch information
1 parent
112c581
commit 1034403
Showing
7 changed files
with
170 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# install the kubeflow-training SDK with its 'huggingface' extra;\n", | ||
"# %pip (rather than !pip) installs into the environment of the running kernel\n", | ||
"%pip install -U 'kubeflow-training[huggingface]'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# import the libraries\n", | ||
"from kubeflow.training.api.training_client import TrainingClient\n", | ||
"from kubeflow.storage_initializer.hugging_face import (\n", | ||
" HuggingFaceModelParams,\n", | ||
" HuggingFaceTrainParams,\n", | ||
" HfDatasetParams,\n", | ||
")\n", | ||
"from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n", | ||
"from peft import LoraConfig\n", | ||
"import transformers\n", | ||
"from transformers import TrainingArguments\n", | ||
"from kubeflow.training import constants" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create a training client; pass the config_file parameter to use a kubeconfig other than \"~/.kube/config\"\n", | ||
"client = TrainingClient()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# submit the fine-tuning job: specify the model, dataset and training parameters\n", | ||
"client.train(\n", | ||
"    name=\"huggingface-test\",\n", | ||
"    num_workers=2,\n", | ||
"    num_procs_per_worker=1,\n", | ||
"    # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n", | ||
"    # storage_config={\n", | ||
"    #     \"size\": \"10Gi\",\n", | ||
"    #     \"storage_class\": \"<your storage class>\",\n", | ||
"    # },\n", | ||
"    model_provider_parameters=HuggingFaceModelParams(\n", | ||
"        model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n", | ||
"        transformer_type=transformers.AutoModelForCausalLM,\n", | ||
"    ),\n", | ||
"    # it is assumed for text related tasks, you have 'text' column in the dataset.\n", | ||
"    # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n", | ||
"    dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n", | ||
"    train_parameters=HuggingFaceTrainParams(\n", | ||
"        lora_config=LoraConfig(\n", | ||
"            r=8,\n", | ||
"            lora_alpha=8,\n", | ||
"            lora_dropout=0.1,\n", | ||
"            bias=\"none\",\n", | ||
"            task_type=\"CAUSAL_LM\",\n", | ||
"        ),\n", | ||
"        training_parameters=TrainingArguments(\n", | ||
"            num_train_epochs=1,\n", | ||
"            per_device_train_batch_size=4,\n", | ||
"            gradient_accumulation_steps=4,\n", | ||
"            gradient_checkpointing=True,\n", | ||
"            # warmup_steps expects an integer step count; a fraction of the run belongs in warmup_ratio\n", | ||
"            warmup_ratio=0.02,\n", | ||
"            # NOTE(review): 1 is an unusually large learning rate for fine-tuning - confirm it is intended\n", | ||
"            learning_rate=1,\n", | ||
"            lr_scheduler_type=\"cosine\",\n", | ||
"            bf16=False,\n", | ||
"            # values below 1 for logging_steps / save_steps are treated as a ratio of total training steps\n", | ||
"            logging_steps=0.01,\n", | ||
"            output_dir=INIT_CONTAINER_MOUNT_PATH,\n", | ||
"            optim=\"sgd\",\n", | ||
"            save_steps=0.01,\n", | ||
"            save_total_limit=3,\n", | ||
"            disable_tqdm=False,\n", | ||
"            resume_from_checkpoint=True,\n", | ||
"            remove_unused_columns=True,\n", | ||
"        ),\n", | ||
"    ),\n", | ||
"    resources_per_worker={\n", | ||
"        \"gpu\": 1,\n", | ||
"        \"cpu\": 8,\n", | ||
"        \"memory\": \"16Gi\",\n", | ||
"    },  # remove the gpu key if you don't want to attach gpus to the pods\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# check the logs of the job\n", | ||
"client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "myenv3.11", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,18 @@ | ||
# Use an official Pytorch runtime as a parent image | ||
FROM nvcr.io/nvidia/pytorch:23.12-py3 | ||
FROM nvcr.io/nvidia/pytorch:23.10-py3 | ||
|
||
# Set the working directory in the container | ||
WORKDIR /app | ||
# Set the working directory in the container | ||
WORKDIR /app | ||
|
||
# Copy the Python package and its source code into the container | ||
COPY . /app | ||
# Copy the Python package and its source code into the container | ||
COPY . /app | ||
|
||
# Copy the requirements.txt file into the container | ||
# Copy the requirements.txt file into the container | ||
COPY requirements.txt /app/requirements.txt | ||
|
||
# Install any needed packages specified in requirements.txt | ||
RUN pip install --no-cache-dir -r requirements.txt | ||
# Install any needed packages specified in requirements.txt | ||
RUN pip install --no-cache-dir -r requirements.txt | ||
|
||
# Run hf_llm_training.py with torchrun when the container launches | ||
ENTRYPOINT ["torchrun", "hf_llm_training.py"] | ||
# Run hf_llm_training.py with torchrun when the container launches | ||
ENTRYPOINT ["torchrun", "hf_llm_training.py"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
peft==0.7.0 | ||
peft>=0.3.0 | ||
datasets==2.15.0 | ||
transformers==4.35.2 | ||
transformers>=4.20.0 | ||
bitsandbytes>=0.42.0 | ||
einops>=0.6.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters