Commit
Merge branch 'sortformer/pr_01' of https://github.com/tango4j/NeMo into sortformer/pr_01
tango4j committed Nov 27, 2024
2 parents 73944e3 + 470579d commit ea4c2a7
Showing 16 changed files with 79 additions and 48 deletions.
13 changes: 11 additions & 2 deletions .github/workflows/_test_template.yml
@@ -60,7 +60,16 @@ jobs:
ARG=("--runtime=nvidia --gpus all")
fi
docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
docker run \
--rm \
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
@@ -95,4 +104,4 @@ jobs:
if: always()
run: |
docker container stop nemo_container_${{ github.run_id }} || true
docker container rm nemo_container_${{ github.run_id }} || true
docker container rm nemo_container_${{ github.run_id }} || true
6 changes: 3 additions & 3 deletions examples/llm/peft/hf.py
@@ -76,11 +76,11 @@ def formatting_prompts_func(examples):
# See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
grad_clip = None
use_dist_samp = False
tokenizer = llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)
tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer(args.model)

llm.api.finetune(
model=llm.HfAutoModelForCausalLM(args.model),
data=llm.HfDatasetDataModule(
model=llm.HFAutoModelForCausalLM(args.model),
data=llm.HFDatasetDataModule(
mk_hf_dataset(tokenizer.tokenizer), pad_token_id=tokenizer.tokenizer.eos_token_id
),
trainer=nl.Trainer(
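For reference, a minimal sketch of the renamed API used by this example; the model id is a placeholder, so treat it as illustrative rather than the script's exact invocation:

    from nemo.collections import llm

    # Renamed in this commit: llm.HfAutoModelForCausalLM -> llm.HFAutoModelForCausalLM,
    # llm.HfDatasetDataModule -> llm.HFDatasetDataModule.
    tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer("gpt2")  # placeholder model id
    model = llm.HFAutoModelForCausalLM("gpt2")  # placeholder model id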
2 changes: 1 addition & 1 deletion examples/llm/sft/hf.py
@@ -84,7 +84,7 @@ def squad(tokenizer) -> pl.LightningDataModule:

from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate

model = llm.HfAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator)
model = llm.HFAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator)
tokenizer = model.tokenizer

llm.api.finetune(
6 changes: 3 additions & 3 deletions nemo/collections/llm/__init__.py
@@ -22,7 +22,7 @@
AlpacaDataModule,
DollyDataModule,
FineTuningDataModule,
HfDatasetDataModule,
HFDatasetDataModule,
MockDataModule,
PreTrainingDataModule,
SquadDataModule,
@@ -64,7 +64,7 @@
GPTConfig126M,
GPTConfig175B,
GPTModel,
HfAutoModelForCausalLM,
HFAutoModelForCausalLM,
Llama2Config7B,
Llama2Config13B,
Llama2Config70B,
@@ -218,7 +218,7 @@
"dolly",
"peft",
"hf_dataset",
"HfAutoModelForCausalLM",
"HFAutoModelForCausalLM",
]


4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/data/__init__.py
@@ -15,7 +15,7 @@
from nemo.collections.llm.gpt.data.alpaca import AlpacaDataModule
from nemo.collections.llm.gpt.data.dolly import DollyDataModule
from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule, build_pretraining_datamodule
from nemo.collections.llm.gpt.data.squad import SquadDataModule
@@ -28,5 +28,5 @@
"MockDataModule",
"PreTrainingDataModule",
"build_pretraining_datamodule",
"HfDatasetDataModule",
"HFDatasetDataModule",
]
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/data/api.py
@@ -16,7 +16,7 @@
import nemo_run as run

from nemo.collections.llm.gpt.data.dolly import DollyDataModule
from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.squad import SquadDataModule

@@ -42,7 +42,7 @@ def dolly() -> pl.LightningDataModule:
@run.cli.factory
@run.autoconvert
def hf_dataset(dataset: str) -> pl.LightningDataModule:
return HfDatasetDataModule(dataset=dataset, global_batch_size=16, micro_batch_size=2)
return HFDatasetDataModule(dataset=dataset, global_batch_size=16, micro_batch_size=2)


__all__ = ["mock", "squad", "dolly", "hf_dataset"]
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/data/hf_dataset.py
@@ -18,7 +18,7 @@
from nemo.lightning.pytorch.plugins import MegatronDataSampler


class HfDatasetDataModule(pl.LightningDataModule):
class HFDatasetDataModule(pl.LightningDataModule):
def __init__(
self,
dataset,
@@ -88,7 +88,7 @@ def train_dataloader(self, collate_fn=None):
from nemo.lightning.data import add_megatron_sampler

if collate_fn is None:
collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id)
collate_fn = lambda x: HFDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id)

return DataLoader(
self.dataset,
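A rough usage sketch of the renamed data module, assuming the constructor and train_dataloader signatures shown in the hunks above; `my_dataset` is a placeholder for a tokenized Hugging Face dataset:

    from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule

    dm = HFDatasetDataModule(my_dataset, pad_token_id=0)  # my_dataset: placeholder dataset
    # With no collate_fn passed, train_dataloader() falls back to
    # HFDatasetDataModule.collate_fn(batch, pad_token_id=dm.pad_token_id).
    loader = dm.train_dataloader()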
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -45,7 +45,7 @@
Gemma2Config27B,
Gemma2Model,
)
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
from nemo.collections.llm.gpt.model.llama import (
CodeLlamaConfig7B,
CodeLlamaConfig13B,
@@ -191,5 +191,5 @@
"transformer_engine_layer_spec",
"transformer_engine_full_layer_spec",
"local_layer_spec",
"HfAutoModelForCausalLM",
"HFAutoModelForCausalLM",
]
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
@@ -31,7 +31,7 @@ def masked_cross_entropy(logits, targets, mask=None):
return F.cross_entropy(logits, targets)


class HfAutoModelForCausalLM(pl.LightningModule, io.IOMixin, fn.FNMixin):
class HFAutoModelForCausalLM(pl.LightningModule, io.IOMixin, fn.FNMixin):
def __init__(
self,
model_name='gpt2',
@@ -57,7 +57,7 @@ def __init__(
@property
def tokenizer(self):
if self._tokenizer is None:
self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name, self.trust_remote_code)
self._tokenizer = HFAutoModelForCausalLM.configure_tokenizer(self.model_name, self.trust_remote_code)
return self._tokenizer

@tokenizer.setter
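The tokenizer property stays lazy after the rename; a small sketch, with a placeholder model id:

    from nemo.collections.llm import HFAutoModelForCausalLM

    model = HFAutoModelForCausalLM(model_name="gpt2")  # placeholder model id
    # First access builds the tokenizer via the renamed classmethod,
    # HFAutoModelForCausalLM.configure_tokenizer(model_name, trust_remote_code).
    tok = model.tokenizer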
20 changes: 10 additions & 10 deletions nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
@@ -23,7 +23,7 @@
from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import pytorch_adam_with_cosine_annealing
@@ -35,23 +35,23 @@
@run.cli.factory(name=NAME)
def model(model_name, load_pretrained_weights) -> run.Config[pl.LightningModule]:
"""
Factory function to create HfAutoModelForCausalLM model configurations.
Factory function to create HFAutoModelForCausalLM model configurations.
Args:
model_name (str): Model id on HF.
Returns:
run.Config[pl.LightningModule]: Configuration for the HfAutoModelForCausalLM.
run.Config[pl.LightningModule]: Configuration for the HFAutoModelForCausalLM.
Examples:
CLI usage:
$ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
$ nemo llm pretrain --factory 'HFAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
Python API usage:
>>> model_config = model(model_name="mistralai/Mistral-Nemo-Instruct-2407")
>>> print(model_config)
"""
return run.Config(HfAutoModelForCausalLM, model_name=model_name, load_pretrained_weights=load_pretrained_weights)
return run.Config(HFAutoModelForCausalLM, model_name=model_name, load_pretrained_weights=load_pretrained_weights)


def trainer(
@@ -69,7 +69,7 @@ def trainer(
gradient_clip_val: float = 1.0,
) -> run.Config[nl.Trainer]:
"""
Configure the NeMo Lightning Trainer for HfAutoModelForCausalLM.
Configure the NeMo Lightning Trainer for HFAutoModelForCausalLM.
This function sets up the distributed training strategy and other training parameters.
@@ -91,7 +91,7 @@
Examples:
CLI usage:
$ nemo llm pretrain trainer=HfAutoModelForCausalLM ...
$ nemo llm pretrain trainer=HFAutoModelForCausalLM ...
Python API usage:
>>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
@@ -131,7 +131,7 @@ def pretrain_recipe(
model_name: str = '',
) -> run.Partial:
"""
Create a pre-training recipe for a HfAutoModelForCausalLM model.
Create a pre-training recipe for a HFAutoModelForCausalLM model.
This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -148,7 +148,7 @@
Examples:
CLI usage:
$ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
$ nemo llm pretrain --factory 'HFAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
Python API usage:
>>> recipe = pretrain_recipe(name="auto_pretrain", num_nodes=2, model_name="mistralai/Mistral-Nemo-Instruct-2407")
@@ -179,7 +179,7 @@ def finetune_recipe(
model_name: str = '',
) -> run.Partial:
"""
Create a fine-tuning recipe for a HfAutoModelForCausalLM model.
Create a fine-tuning recipe for a HFAutoModelForCausalLM model.
This function sets up a complete configuration for fine-tuning, including
model, trainer, data, logging, optimization, and resumption settings.
3 changes: 0 additions & 3 deletions nemo/lightning/io/pl.py
@@ -155,9 +155,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)

fs = get_filesystem(checkpoint_dir)
if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir):
logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving')
return
fs.makedirs(checkpoint_dir, exist_ok=True)

validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
14 changes: 10 additions & 4 deletions nemo/lightning/pytorch/callbacks/nsys.py
@@ -74,10 +74,14 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt
"""

device = trainer.strategy.root_device
current_step = trainer.strategy.current_epoch_step
try:
# Not all strategies have this. e.g.:
# AttributeError: 'SingleDeviceStrategy' object has no attribute 'current_epoch_step'
current_step = trainer.strategy.current_epoch_step
except AttributeError:
current_step = self._nsys_profile_start_step
if device.type == 'cuda':
if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks:
logging.info("====== Start nsys profiling ======")
torch.cuda.cudart().cudaProfilerStart()
if self._nsys_profile_gen_shape:
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
@@ -91,9 +95,11 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int)
"""

device = trainer.strategy.root_device
current_step = trainer.strategy.current_epoch_step
try:
current_step = trainer.strategy.current_epoch_step
except AttributeError:
current_step = self._nsys_profile_end_step
if device.type == 'cuda':
if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks:
logging.info("====== End nsys profiling ======")
torch.cuda.cudart().cudaProfilerStop()
torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
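The try/except above makes the profiling hooks tolerant of strategies that lack `current_epoch_step` (e.g. SingleDeviceStrategy). A stand-alone illustration of the same fallback, using a stand-in class rather than the real Lightning strategy:

    class _NoStepStrategy:
        """Stand-in for a strategy without current_epoch_step."""

    strategy = _NoStepStrategy()
    profile_start_step = 10  # stands in for self._nsys_profile_start_step
    try:
        current_step = strategy.current_epoch_step
    except AttributeError:
        current_step = profile_start_step
    # current_step == 10, so profiling still begins at the configured step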
1 change: 1 addition & 0 deletions nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
from collections import OrderedDict
from pathlib import Path
2 changes: 1 addition & 1 deletion nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -278,7 +278,7 @@ def connect(self, model: pl.LightningModule) -> None:
"""Attaches a model to strategy."""
super().connect(model)

assert not 'is_hf_model' in model.__dict__, "Cannot use HfAutoModelForCausalLM with MegatronParallel"
assert not 'is_hf_model' in model.__dict__, "Cannot use HFAutoModelForCausalLM with MegatronParallel"

dtype_config = getattr(self._precision_plugin, "dtype_config", None)
if self.pipeline_dtype is None and dtype_config:
38 changes: 27 additions & 11 deletions nemo/lightning/resume.py
@@ -37,17 +37,25 @@


def _try_restore_tokenizer(model, ckpt_path):
from nemo.collections.common.tokenizers import TokenizerSpec
from nemo.lightning.io import load_context

try:
tokenizer = load_context(ckpt_path, "model.tokenizer")
except ValueError as e:
logging.warning(
f"Encountered error while trying to restore tokenizer. Tokenizer is not restored. " f"Original error: {e}"
)
return model

if isinstance(tokenizer, TokenizerSpec):
model.tokenizer = tokenizer
model.__io__.tokenizer = tokenizer.__io__
except:
# Ignore if the ckpt doesn't have a tokenizer.
pass
finally:
return model
else:
# Ignore if the ckpt doesn't have a tokenizer. type(tokenizer)==TrainerContext in this case.
logging.warning("Checkpoint does not have model.tokenizer field. Tokenizer is not restored.")

return model


@dataclass(kw_only=True)
Expand All @@ -56,8 +64,10 @@ class AutoResume:
checkpoints in NeMo.
Attributes:
restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model weights, optimizer states, etc.
If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format.
restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model
weights, optimizer states, etc.
If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be
automatically converted to a NeMo compatible format.
resume_from_folder or the run's log_dir takes precedence over restore_config.
resume_from_directory (str): Path to the checkpointing directory to restore from.
resume_from_path (str): Path to a specific checkpoint to restore from.
@@ -209,17 +219,22 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:

if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0):
if self.resume_ignore_no_checkpoint:
warn = f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. "
warn = (
f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
f":{checkpoint_dir}. "
)
if checkpoint is None:
warn += "Training from scratch."
logging.warning(warn)
else:
if self.restore_config:
# resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore later instead.
# resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore
# later instead.
return None
else:
raise NotFoundError(
f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume."
f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
f":{checkpoint_dir}. Cannot resume."
)
elif len(end_checkpoints) > 0:
if not self.resume_past_end:
@@ -240,7 +255,8 @@
# Select the checkpoint with the latest modified time
checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0]
logging.warning(
f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time."
f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest "
f"modified time."
)
else:
checkpoint = last_checkpoints[0]
2 changes: 2 additions & 0 deletions tests/collections/llm/test_mnist_model_nemo2_fsdp.py
@@ -525,6 +525,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
every_n_train_steps=5,
# Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
always_save_context=True,
filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
)
root_dir = tmpdir
save_dir = root_dir / name
@@ -572,6 +573,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
global_batch_size=2,
output_log=False, # Disable logs to support predict_step
),
ckpt_load_optimizer=False,
)
predict_trainer = nl.Trainer(
accelerator="gpu",
