
Commit

Merge branch 'main' into chcui/tokenizer_exception
cuichenx authored Nov 22, 2024
2 parents c197aed + 400dd84 commit 537c564
Showing 14 changed files with 343 additions and 130 deletions.
1 change: 1 addition & 0 deletions .github/workflows/cicd-main.yml
@@ -4493,6 +4493,7 @@ jobs:
- L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
- L2_NEMO_2_LoRA_MERGE
- L2_NeMo_2_Mixtral_Pretraining
- L2_PTQ_Llama2_FP8
- L2_Community_LLM_Checkpoints_tests_Llama3
2 changes: 0 additions & 2 deletions examples/asr/transcribe_speech.py
@@ -275,8 +275,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis

# we will adjust this flag if the model does not support it
compute_langs = cfg.compute_langs
if cfg.timestamps:
cfg.return_hypotheses = True

# Check whether model and decoder type match
if isinstance(asr_model, EncDecCTCModel):
1 change: 0 additions & 1 deletion nemo/collections/asr/models/ctc_models.py
@@ -160,7 +160,6 @@ def transcribe(
A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as
paths2audio_files
"""
timestamps = timestamps or override_config.timestamps if override_config is not None else None
if timestamps is not None:
# else retain the decoder state (users can set it using change_decoding_strategy)
if timestamps or (override_config is not None and override_config.timestamps):
2 changes: 1 addition & 1 deletion nemo/collections/asr/models/rnnt_models.py
@@ -285,7 +285,7 @@ def transcribe(
* A list of greedy transcript texts / Hypothesis
* An optional list of beam search transcript texts / Hypothesis / NBestHypothesis.
"""
timestamps = timestamps or override_config.timestamps if override_config is not None else None

if timestamps is not None:
if timestamps or (override_config is not None and override_config.timestamps):
logging.info(
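The three ASR changes above (transcribe_speech.py, ctc_models.py, rnnt_models.py) all simplify how a timestamp request reaches transcribe(): the flag now comes either from the timestamps argument or from an override_config that has timestamps set, without toggling return_hypotheses by hand. A minimal illustrative sketch, not part of this commit (the pretrained model name and the shape of the returned hypotheses are assumptions):

```python
# Illustrative only: shows the call path the simplified timestamp check serves.
import nemo.collections.asr as nemo_asr

# Any CTC or RNNT checkpoint should behave the same way; this name is an assumption.
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_ctc_small")

# Requesting timestamps through the keyword argument is enough; transcribe()
# also honours an override_config whose timestamps field is set.
hypotheses = asr_model.transcribe(["sample.wav"], timestamps=True)

# With timestamps enabled the results are hypothesis objects rather than plain
# strings, so the transcript text lives on an attribute.
print(hypotheses[0].text)
```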
1 change: 1 addition & 0 deletions nemo/collections/llm/gpt/model/ssm.py
@@ -290,6 +290,7 @@ class BaseMambaConfig2_7B(SSMConfig):
@dataclass
class NVIDIAMambaConfig8B(SSMConfig):
hybrid_override_pattern: str = "M" * 56
num_attention_heads: int = 32
num_layers: int = 56
seq_length: int = 4096
hidden_size: int = 4096
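For reference, a hypothetical snippet (not from this commit) showing where the newly added num_attention_heads default lands once the config dataclass is instantiated:

```python
# Hypothetical sketch: instantiate the config to inspect the defaults shown in
# the diff, including the newly added num_attention_heads field.
from nemo.collections.llm.gpt.model.ssm import NVIDIAMambaConfig8B

cfg = NVIDIAMambaConfig8B()
print(cfg.num_attention_heads)                  # 32, the field added here
print(cfg.num_layers, cfg.hidden_size)          # 56, 4096
print(cfg.hybrid_override_pattern == "M" * 56)  # True, per the existing default
```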
64 changes: 47 additions & 17 deletions nemo/collections/llm/recipes/mamba2_130m.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
)


@run.cli.factory(target=finetune, name=NAME)
def trainer(
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
sequence_parallelism: bool = False,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
max_steps: int = 1168251,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
"""
@@ -137,15 +142,15 @@
accumulate_grad_batches=1,
callbacks=callbacks,
devices=num_gpus_per_node,
limit_test_batches=50,
limit_val_batches=32,
log_every_n_steps=10,
max_steps=max_steps,
num_nodes=num_nodes,
plugins=bf16_mixed(),
strategy=strategy,
use_distributed_sampler=False,
val_check_interval=2000,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
)

return trainer
Expand All @@ -158,6 +163,16 @@ def pretrain_recipe(
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
seq_length: int = 4096,
gbs: int = 8,
mbs: int = 1,
fn=pretrain,
) -> run.Partial:
"""
@@ -193,16 +208,23 @@
fn,
model=model(),
trainer=trainer(
max_steps=max_steps,
num_nodes=num_nodes,
tensor_parallelism=tensor_parallelism,
pipeline_parallelism=pipeline_parallelism,
num_gpus_per_node=num_gpus_per_node,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(
MockDataModule,
seq_length=4096,
global_batch_size=8,
micro_batch_size=1,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(),
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
@@ -218,6 +240,14 @@ def finetune_recipe(
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
seq_length: int = 4096,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
gbs: int = 8,
mbs: int = 1,
peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@
)
strategy = run.Config(
nl.MegatronStrategy,
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
gradient_as_bucket_view=True,
ckpt_load_optimizer=False,
ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
accelerator="gpu",
accumulate_grad_batches=1,
devices=num_gpus_per_node,
limit_test_batches=10,
limit_val_batches=10,
log_every_n_steps=20,
max_steps=100,
max_steps=max_steps,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
num_nodes=num_nodes,
plugins=run.Config(
nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
callbacks=[checkpoint_callback],
strategy=strategy,
use_distributed_sampler=False,
val_check_interval=20,
)
recipe = run.Partial(
llm.finetune,
model=model(tokenizer_model=tokenizer_model),
trainer=trainer,
data=run.Config(
llm.SquadDataModule,
seq_length=2048,
seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
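The mamba2_130m recipe changes above mostly promote previously hard-coded trainer settings (max_steps, val_check_interval, batch limits, logging cadence, parallelism sizes, sequence length, and batch sizes) to function arguments. A hypothetical invocation of the updated finetune_recipe, not part of this commit and using only parameters visible in the diff; paths, values, and the executor choice are assumptions:

```python
# Hypothetical usage sketch; argument names come from the updated signature,
# values and paths are placeholders.
import nemo_run as run
from nemo.collections.llm.recipes import mamba2_130m

recipe = mamba2_130m.finetune_recipe(
    tokenizer_model="/path/to/tokenizer.model",  # placeholder path
    num_nodes=1,
    num_gpus_per_node=8,
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    seq_length=4096,         # previously fixed at 2048 for the SQuAD data module
    max_steps=500,           # previously fixed at 100
    val_check_interval=100,  # previously fixed at 20
    gbs=8,
    mbs=1,
    peft_scheme="none",      # full-parameter fine-tuning
)

# Launching locally is one option; any nemo_run executor should work.
run.run(recipe, executor=run.LocalExecutor())
```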
68 changes: 49 additions & 19 deletions nemo/collections/llm/recipes/mamba2_1_3b.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
)


@run.cli.factory(target=finetune, name=NAME)
def trainer(
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
sequence_parallelism: bool = False,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
max_steps: int = 1168251,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
"""
@@ -137,15 +142,15 @@
accumulate_grad_batches=1,
callbacks=callbacks,
devices=num_gpus_per_node,
limit_test_batches=50,
limit_val_batches=32,
log_every_n_steps=10,
max_steps=max_steps,
num_nodes=num_nodes,
plugins=bf16_mixed(),
strategy=strategy,
use_distributed_sampler=False,
val_check_interval=2000,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
)

return trainer
Expand All @@ -157,7 +162,17 @@ def pretrain_recipe(
name: str = "default",
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
num_gpus_per_node: int = 1,
tensor_parallelism: int = 1,
pipeline_parallelism: int = 1,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
seq_length: int = 4096,
gbs: int = 8,
mbs: int = 1,
fn=pretrain,
) -> run.Partial:
"""
@@ -191,17 +206,24 @@
"""
return run.Partial(
fn,
model=model(),
model=model(tokenizer_model=tokenizer_model),
trainer=trainer(
max_steps=max_steps,
num_nodes=num_nodes,
tensor_parallelism=tensor_parallelism,
pipeline_parallelism=pipeline_parallelism,
num_gpus_per_node=num_gpus_per_node,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(
MockDataModule,
seq_length=4096,
global_batch_size=8,
micro_batch_size=1,
seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
Expand All @@ -217,7 +239,15 @@ def finetune_recipe(
resume_path: str = None,
tokenizer_model: str = None,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
num_gpus_per_node: int = 1,
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
seq_length: int = 4096,
max_steps: int = 100,
val_check_interval: int = 100,
limit_test_batches: int = 50,
limit_val_batches: int = 32,
log_every_n_steps: int = 10,
gbs: int = 8,
mbs: int = 1,
peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@
)
strategy = run.Config(
nl.MegatronStrategy,
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
gradient_as_bucket_view=True,
ckpt_load_optimizer=False,
ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
accelerator="gpu",
accumulate_grad_batches=1,
devices=num_gpus_per_node,
limit_test_batches=10,
limit_val_batches=10,
log_every_n_steps=20,
max_steps=100,
max_steps=max_steps,
val_check_interval=val_check_interval,
limit_test_batches=limit_test_batches,
limit_val_batches=limit_val_batches,
log_every_n_steps=log_every_n_steps,
num_nodes=num_nodes,
plugins=run.Config(
nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
callbacks=[checkpoint_callback],
strategy=strategy,
use_distributed_sampler=False,
val_check_interval=20,
)
recipe = run.Partial(
llm.finetune,
model=model(tokenizer_model=tokenizer_model),
trainer=trainer,
data=run.Config(
llm.SquadDataModule,
seq_length=2048,
seq_length=seq_length,
global_batch_size=gbs,
micro_batch_size=mbs,
tokenizer=tokenizer(tokenizer_model=tokenizer_model),
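The mamba2_1_3b recipes receive the same parameterization; note that the default num_gpus_per_node in this file drops from 8 to 1, so multi-GPU runs now pass it explicitly. A hypothetical pretraining invocation, not part of this commit, again restricted to parameters that appear in the diff:

```python
# Hypothetical sketch; values are illustrative and the tokenizer path is a placeholder.
from nemo.collections.llm.recipes import mamba2_1_3b

recipe = mamba2_1_3b.pretrain_recipe(
    name="mamba2_1_3b_pretrain",
    tokenizer_model="/path/to/tokenizer.model",
    num_nodes=1,
    num_gpus_per_node=8,     # file default is now 1
    tensor_parallelism=1,
    pipeline_parallelism=1,
    max_steps=1000,
    val_check_interval=200,
    log_every_n_steps=10,
    seq_length=4096,
    gbs=8,
    mbs=1,
)
```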