From a6b08a693fde417ada4711d98e405dce06c7dc2f Mon Sep 17 00:00:00 2001
From: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
Date: Wed, 27 Nov 2024 17:34:31 -0400
Subject: [PATCH] Huvu/t5 nemo2.0 nemoci 3b11b (#11388)

* Update t5_11b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>

* Update t5_3b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>

* Update t5_3b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>

* Update t5_3b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>

---------

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
---
 nemo/collections/llm/recipes/t5_11b.py | 18 +++++++++++-------
 nemo/collections/llm/recipes/t5_3b.py  | 18 +++++++++++-------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/nemo/collections/llm/recipes/t5_11b.py b/nemo/collections/llm/recipes/t5_11b.py
index ee7323aa044f..c54bf48b9613 100644
--- a/nemo/collections/llm/recipes/t5_11b.py
+++ b/nemo/collections/llm/recipes/t5_11b.py
@@ -175,7 +175,8 @@ def pretrain_recipe(
         guide in the `examples/llm/pretrain/` directory.
     """
-    opt_config = OptimizerConfig(
+    opt_config = run.Config(
+        OptimizerConfig,
         optimizer='adam',
         lr=0.0001,
         use_distributed_optimizer=True,
@@ -183,7 +184,8 @@ def pretrain_recipe(
         weight_decay=0.01,
     )
 
-    lr_scheduler = WarmupAnnealingScheduler(
+    lr_scheduler = run.Config(
+        WarmupAnnealingScheduler,
         warmup_steps=None,
         warmup_ratio=0.01,
         max_steps=1000000,
@@ -202,7 +204,7 @@ def pretrain_recipe(
             MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
-        optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
+        optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
         resume=default_resume(),
     )
@@ -248,15 +250,17 @@ def finetune_recipe(
         on fine-tuning LLMs with NeMo, see the fine-tuning
         guide in the `examples/llm/finetune/` directory.
     """
-    opt_config = OptimizerConfig(
+    opt_config = run.Config(
+        OptimizerConfig,
         optimizer='adam',
-        lr=1e-4,
+        lr=0.0001,
         use_distributed_optimizer=True,
         bf16=True,
         weight_decay=0.01,
     )
 
-    lr_scheduler = WarmupAnnealingScheduler(
+    lr_scheduler = run.Config(
+        WarmupAnnealingScheduler,
         warmup_steps=50,
         max_steps=2000,
         min_lr=0.00001,
@@ -273,7 +277,7 @@ def finetune_recipe(
             SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
-        optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
+        optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
         resume=nemo_resume(checkpoint_path),
     )
diff --git a/nemo/collections/llm/recipes/t5_3b.py b/nemo/collections/llm/recipes/t5_3b.py
index 82772e1b865a..b1783594d2f7 100644
--- a/nemo/collections/llm/recipes/t5_3b.py
+++ b/nemo/collections/llm/recipes/t5_3b.py
@@ -175,7 +175,8 @@ def pretrain_recipe(
         guide in the `examples/llm/pretrain/` directory.
     """
-    opt_config = OptimizerConfig(
+    opt_config = run.Config(
+        OptimizerConfig,
         optimizer='adam',
         lr=0.0001,
         use_distributed_optimizer=True,
@@ -183,7 +184,8 @@ def pretrain_recipe(
         weight_decay=0.01,
     )
 
-    lr_scheduler = WarmupAnnealingScheduler(
+    lr_scheduler = run.Config(
+        WarmupAnnealingScheduler,
         warmup_steps=None,
         warmup_ratio=0.01,
         max_steps=1000000,
@@ -202,7 +204,7 @@ def pretrain_recipe(
             MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
-        optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
+        optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
         resume=default_resume(),
     )
@@ -248,15 +250,17 @@ def finetune_recipe(
         on fine-tuning LLMs with NeMo, see the fine-tuning
         guide in the `examples/llm/finetune/` directory.
     """
-    opt_config = OptimizerConfig(
+    opt_config = run.Config(
+        OptimizerConfig,
         optimizer='adam',
-        lr=1e-4,
+        lr=0.0001,
         use_distributed_optimizer=True,
         bf16=True,
         weight_decay=0.01,
     )
 
-    lr_scheduler = WarmupAnnealingScheduler(
+    lr_scheduler = run.Config(
+        WarmupAnnealingScheduler,
         warmup_steps=50,
         max_steps=2000,
         min_lr=0.00001,
@@ -273,7 +277,7 @@ def finetune_recipe(
             SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
-        optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
+        optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
         resume=nemo_resume(checkpoint_path),
     )