From f7a88d143448434de993c64100763720657b656b Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Fri, 1 Nov 2024 05:36:21 -0700
Subject: [PATCH] NeMo-UX: Mistral/mixtral peft ci test (#11094)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add mistral/mixtral peft ci test

Signed-off-by: Alexandros Koumparoulis

* add mistral/mixtral peft ci test

Signed-off-by: Alexandros Koumparoulis

* add mistral tp2

Signed-off-by: Alexandros Koumparoulis

* Apply isort and black reformatting

Signed-off-by: akoumpa

* add tests to NEMO_CICD_Test

Signed-off-by: Alexandros Koumparoulis

* Update .github/workflows/cicd-main.yml

Co-authored-by: oliver könig

Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>

* fix params

Signed-off-by: Alexandros Koumparoulis

* rm devices arg

Signed-off-by: Alexandros Koumparoulis

* add --dist-opt arg

Signed-off-by: Alexandros Koumparoulis

* add tp=2 mixtral

Signed-off-by: Alexandros Koumparoulis

* add ep test

Signed-off-by: Alexandros Koumparoulis

* fix

Signed-off-by: Alexandros Koumparoulis

---------

Signed-off-by: Alexandros Koumparoulis
Signed-off-by: akoumpa
Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Co-authored-by: akoumpa
Co-authored-by: oliver könig
---
 .github/workflows/cicd-main.yml         |  80 ++++++++++++++
 tests/collections/llm/lora_mistralai.py | 139 ++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 tests/collections/llm/lora_mistralai.py

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index bb239acb00fc..2bdbe673d19b 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -4266,6 +4266,81 @@ jobs:
             --pp_size 1 \
             --mbs 1 --packed
 
+  L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+
+        python tests/collections/llm/lora_mistralai.py \
+            --max-steps 3 \
+            --ep 1 \
+            --mbs 2 \
+            --model mixtral
+
+  L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+
+        python tests/collections/llm/lora_mistralai.py \
+            --max-steps 3 \
+            --tp 1 \
+            --mbs 1 \
+            --model mixtral \
+            --dist-opt
+
+  L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+
+        python tests/collections/llm/lora_mistralai.py \
+            --max-steps 3 \
+            --tp 2 \
+            --mbs 1 \
+            --model mixtral \
+            --dist-opt
+
+  L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+
+        python tests/collections/llm/lora_mistralai.py \
+            --max-steps 3 \
+            --tp 1 \
+            --mbs 1 \
+            --model mistral \
+            --dist-opt
+
+  L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+
+        python tests/collections/llm/lora_mistralai.py \
+            --max-steps 3 \
+            --tp 2 \
+            --mbs 1 \
+            --model mistral \
+            --dist-opt
+
+
   L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
@@ -4422,6 +4497,11 @@ jobs:
       - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2
       - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
       - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
+      - L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
+      - L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
+      - L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
+      - L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
+      - L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
       - L2_NeMo_2_Mixtral_Pretraining
       - L2_PTQ_Llama2_FP8
      - L2_Community_LLM_Checkpoints_tests_Llama3
diff --git a/tests/collections/llm/lora_mistralai.py b/tests/collections/llm/lora_mistralai.py
new file mode 100644
index 000000000000..09a52668e3ee
--- /dev/null
+++ b/tests/collections/llm/lora_mistralai.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import pytorch_lightning as pl
+import torch
+from megatron.core.optimizer import OptimizerConfig
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.lightning.io.mixin import track_io
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='Finetune a small Mistral/Mixtral model with LoRA using NeMo 2.0')
+    parser.add_argument('--model', type=str.lower, choices=['mistral', 'mixtral'], help="model family to finetune")
+    parser.add_argument('--max-steps', type=int, default=9, help="number of training steps")
+    parser.add_argument('--mbs', type=int, default=2, help="micro batch size")
+    parser.add_argument('--gbs', type=int, default=4, help="global batch size")
+    parser.add_argument('--tp', type=int, default=1, help="tensor parallel size")
+    parser.add_argument('--ep', type=int, default=1, help="expert parallel size")
+    parser.add_argument('--dist-opt', action='store_true', help='use the distributed optimizer')
+    return parser.parse_args()
+
+
+def trainer(devices, tp, ep, sp, max_steps) -> nl.Trainer:
+    strategy = nl.MegatronStrategy(
+        tensor_model_parallel_size=tp,
+        expert_model_parallel_size=ep,
+        sequence_parallel=sp,
+    )
+
+    return nl.Trainer(
+        devices=max(ep, tp),
+        max_steps=max_steps,
+        accelerator="gpu",
+        strategy=strategy,
+        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
+        log_every_n_steps=1,
+        limit_val_batches=0,
+        val_check_interval=0,
+        num_sanity_val_steps=0,
+    )
+
+
+@track_io
+class OrdTokenizer:
+    def __init__(self, vocab_size=30_000, num_reserved_tokens=128, special_token_names=['bos_id', 'eos_id', 'pad_id']):
+        self.vocab_size = vocab_size
+        self.num_reserved_tokens = num_reserved_tokens
+        self.special_token_names = special_token_names
+        assert len(self.special_token_names) < num_reserved_tokens
+
+    def __getattr__(self, name):
+        if name in self.__dict__.get('special_token_names', {}):
+            return self.__dict__['special_token_names'].index(name)
+        elif name in self.__dict__:
+            return self.__dict__[name]
+        else:
+            raise AttributeError
+
+    def text_to_ids(self, text):
+        token_ids = list(map(lambda x: self.num_reserved_tokens + ord(x), list(text)))
+        assert max(token_ids) < self.vocab_size
+        return token_ids
+
+
+def logger() -> nl.NeMoLogger:
+    ckpt = nl.ModelCheckpoint(
+        save_last=True,
+        every_n_train_steps=10,
+        monitor="reduced_train_loss",
+        save_top_k=1,
+        save_on_train_epoch_end=True,
+        save_optim_on_train_end=True,
+    )
+
+    return nl.NeMoLogger(
+        name="nemo2_peft",
+        log_dir="/tmp/peft_logs",
+        use_datetime_version=False,  # must be false if using auto resume
+        ckpt=ckpt,
+        wandb=None,
+    )
+
+
+def squad(mbs, gbs) -> pl.LightningDataModule:
+    return llm.SquadDataModule(seq_length=2048, micro_batch_size=mbs, global_batch_size=gbs, num_workers=0)
+
+
+def mixtral_8x7b() -> tuple[pl.LightningModule, llm.peft.LoRA]:
+    tokenizer = OrdTokenizer()
+    model = llm.MixtralModel(llm.MixtralConfig8x7B(num_layers=2), tokenizer=tokenizer)
+    lora = llm.peft.LoRA()
+    return model, lora
+
+
+def mistral_7b() -> tuple[pl.LightningModule, llm.peft.LoRA]:
+    tokenizer = OrdTokenizer()
+    model = llm.MistralModel(llm.MistralConfig7B(num_layers=2), tokenizer=tokenizer)
+    lora = llm.peft.LoRA()
+    return model, lora
+
+
+if __name__ == '__main__':
+    args = get_args()
+    if args.model == 'mistral':
+        model, lora = mistral_7b()
+    else:
+        model, lora = mixtral_8x7b()
+    llm.finetune(
+        model=model,
+        data=squad(args.mbs, args.gbs),
+        trainer=trainer(devices=args.tp, tp=args.tp, ep=args.ep, sp=args.tp > 1, max_steps=args.max_steps),
+        peft=lora,
+        log=logger(),
+        optim=nl.MegatronOptimizerModule(
+            config=OptimizerConfig(
+                optimizer="adam",
+                lr=0.0001,
+                adam_beta2=0.98,
+                use_distributed_optimizer=args.dist_opt,
+                clip_grad=1.0,
+                bf16=True,
+            ),
+        ),
+    )
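
Reviewer note, not part of the patch: the standalone sketch below summarizes what each new CI job requests. The flag values are copied from the SCRIPT blocks in the workflow hunk above; the only pieces of test-script logic it mirrors are the --gbs default of 4 and the device count handed to nl.Trainer (devices = max(ep, tp)) together with the sequence-parallel switch (tp > 1). It has no NeMo dependency, so it runs in any Python interpreter.

    # Flag values copied verbatim from the new workflow jobs; unset flags use
    # the defaults from get_args() in tests/collections/llm/lora_mistralai.py.
    JOBS = {
        "L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2": dict(model="mixtral", tp=1, ep=1, mbs=2, dist_opt=False),
        "L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1": dict(model="mixtral", tp=1, ep=1, mbs=1, dist_opt=True),
        "L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1": dict(model="mixtral", tp=2, ep=1, mbs=1, dist_opt=True),
        "L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1": dict(model="mistral", tp=1, ep=1, mbs=1, dist_opt=True),
        "L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1": dict(model="mistral", tp=2, ep=1, mbs=1, dist_opt=True),
    }
    GBS_DEFAULT = 4  # --gbs default in get_args(); no job overrides it

    for name, job in JOBS.items():
        devices = max(job["ep"], job["tp"])  # mirrors nl.Trainer(devices=max(ep, tp), ...)
        sp = job["tp"] > 1                   # mirrors the sp argument passed to trainer()
        print(f"{name}: model={job['model']} devices={devices} mbs={job['mbs']} "
              f"gbs={GBS_DEFAULT} sp={sp} dist_opt={job['dist_opt']}")

Jobs with --tp 2 therefore ask nl.Trainer for two devices on the self-hosted-azure runner; the remaining jobs, as parameterized above, run on a single device.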
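Reviewer note, not part of the patch: the test script avoids downloading a real tokenizer by using the character-level OrdTokenizer defined above. The sketch below is a slightly simplified, dependency-free copy (the @track_io decorator, the NeMo imports, and the __dict__ fallback branch that normal attribute lookup already covers are dropped) so its behaviour can be checked in a plain Python shell.

    class OrdTokenizer:
        def __init__(self, vocab_size=30_000, num_reserved_tokens=128, special_token_names=['bos_id', 'eos_id', 'pad_id']):
            self.vocab_size = vocab_size
            self.num_reserved_tokens = num_reserved_tokens
            self.special_token_names = special_token_names
            assert len(self.special_token_names) < num_reserved_tokens

        def __getattr__(self, name):
            # Only called when normal attribute lookup fails; special token
            # names resolve to their position in special_token_names.
            if name in self.__dict__.get('special_token_names', []):
                return self.__dict__['special_token_names'].index(name)
            raise AttributeError(name)

        def text_to_ids(self, text):
            # Character-level ids: the first num_reserved_tokens ids stay free
            # for special tokens, each character maps to
            # num_reserved_tokens + ord(char) (128 + ord(char) with defaults).
            token_ids = [self.num_reserved_tokens + ord(ch) for ch in text]
            assert max(token_ids) < self.vocab_size
            return token_ids

    tok = OrdTokenizer()
    print(tok.bos_id, tok.eos_id, tok.pad_id)  # -> 0 1 2
    print(tok.text_to_ids("Hi"))               # -> [200, 233]

Special tokens resolve to their index in special_token_names (bos_id=0, eos_id=1, pad_id=2), and ordinary characters map to 128 + ord(char), which keeps the ids well inside the 30,000-token vocabulary for the SQuAD text supplied by the data module.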