From 9b91df58e57cb817d7f5f17f612e7e2f870e31ec Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 8 Jul 2024 13:50:47 -0700 Subject: [PATCH 001/173] Nemotron export - fixing megatron_export.py (#9625) * Nemotron ONNX export fixed Signed-off-by: Boris Fomitchev * Cleanup Signed-off-by: Boris Fomitchev * Addressing code review comments Signed-off-by: Boris Fomitchev --------- Signed-off-by: Boris Fomitchev Co-authored-by: Eric Harper --- nemo/utils/export_utils.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index c44530944051..534598097bf4 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -72,10 +72,12 @@ def __init__(self, weight, bias, skip_bias_add): self.weight = weight self.skip_bias_add = skip_bias_add - def forward(self, x): + def forward(self, x, weight=None): + if weight is None: + weight = self.weight if self.skip_bias_add: - return F.linear(x, self.weight), self.bias - return F.linear(x, self.weight, self.bias), None + return F.linear(x, weight), self.bias + return F.linear(x, weight, self.bias), None def get_export_format(filename: str): @@ -239,7 +241,8 @@ def run_ort_and_compare(sess, ort_input, output_example, check_tolerance=0.01): from apex.contrib.layer_norm.layer_norm import FastLayerNorm from apex.normalization import MixedFusedRMSNorm from apex.normalization.fused_layer_norm import FusedLayerNorm, MixedFusedLayerNorm - from apex.transformer.functional.fused_softmax import FusedScaleMaskSoftmax + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm as MCoreFusedLayerNorm + from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear def replace_FusedLayerNorm(n: nn.Module) -> Optional[nn.LayerNorm]: @@ -255,21 +258,17 @@ def replace_FusedLayerNorm(n: nn.Module) -> Optional[nn.LayerNorm]: if isinstance(n, FusedLayerNorm) or isinstance(n, MixedFusedLayerNorm): shape, eps, affine = n.normalized_shape, n.eps, n.elementwise_affine - n_state = n.state_dict() + elif isinstance(n, MCoreFusedLayerNorm): + shape, eps, affine = n.weight.shape, n.eps, True elif isinstance(n, FastLayerNorm): shape, eps, affine = n.weight.shape, n.epsilon, True - n_state = n.state_dict() - elif isinstance(n, MixedFusedRMSNorm): - shape, eps, affine = n.normalized_shape, n.eps, n.elementwise_affine - tmp_n_state = n.state_dict() - n_state = {'weight': tmp_n_state['weight'], 'bias': torch.zeros_like(tmp_n_state['weight'])} else: return None n_state = n.state_dict() mod = nn.LayerNorm(shape, eps=eps, elementwise_affine=affine, device=p.device, dtype=p.dtype) - mod.load_state_dict(n_state) + mod.load_state_dict(n_state, strict=True) return mod @@ -306,7 +305,7 @@ def replace_ParallelLinear(n: nn.Module) -> Optional[nn.Linear]: mod = LinearWithBiasSkip(n.weight, n.bias, n.skip_bias_add).to(dev) n_state = n.state_dict() - mod.load_state_dict(n_state) + mod.load_state_dict(n_state, strict=False) return mod def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: @@ -318,7 +317,7 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: Equivalent LayerNorm module """ if not isinstance(n, FusedScaleMaskSoftmax): - logging.warning("This function can only change the FusedScaleMaskSoftmax module.") + logging.warning(f"This function can only change the FusedScaleMaskSoftmax module, got: {n.__class__}") return n # disable the fusion only 
@@ -331,6 +330,7 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: default_Apex_replacements = { "FusedLayerNorm": replace_FusedLayerNorm, "MixedFusedLayerNorm": replace_FusedLayerNorm, + "MCoreFusedLayerNorm": replace_FusedLayerNorm, "FastLayerNorm": replace_FusedLayerNorm, "RowParallelLinear": replace_ParallelLinear, "ColumnParallelLinear": replace_ParallelLinear, From 62459cc45af964f7f754c1c49c72559bcce4fb64 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Mon, 8 Jul 2024 17:13:55 -0400 Subject: [PATCH 002/173] support lora when kv_channel != hidden_size / num_heads (#9636) --- nemo/collections/nlp/parts/peft_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 50c97e349885..726ca33611d7 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -170,7 +170,7 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["dense_module"]: adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.hidden_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig + cfg, lora_cfg, projection_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig ) name_key_to_cfg[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = adapter_cfg name_key_to_mcore_mixins[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = [ From 55ee9f454229b5075bd29ee9e60b08c719cb7681 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 9 Jul 2024 08:24:01 -0700 Subject: [PATCH 003/173] [Nemo CICD] Docker temp files auto-cleanup (#9642) * Docker cleanup --- .github/workflows/_test_template.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 0dbb1d50ee52..ebdc99cef847 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -34,6 +34,13 @@ on: description: Last 2000 characters of the test step's log value: ${{ jobs.main.outputs.log }} jobs: + runner-auto-clean: + runs-on: ${{ inputs.RUNNER }} + steps: + - name: Docker system cleanup + run: | + docker system prune -a --filter "until=48h" --force + main: runs-on: ${{ inputs.RUNNER }} outputs: From b97da9cea5d8be50733ce7bb9a3ea1297f0ce54d Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:24:45 -0400 Subject: [PATCH 004/173] Update Dockerfile.ci (#9651) Signed-off-by: huvunvidia <86480512+huvunvidia@users.noreply.github.com> --- Dockerfile.ci | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.ci b/Dockerfile.ci index dd8af593768f..55c31e47f6d3 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -47,6 +47,7 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n "megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \ "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \ "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \ +"unstructured==0.14.9" \ "llama-index==0.10.43" \ "onnxscript @ git+https://github.com/microsoft/onnxscript" \ -r tools/ctc_segmentation/requirements.txt \ From 1c73e1bff880e922d1af446e7a3dc08d8a195a5f Mon Sep 17 00:00:00 2001 From: Rohit Jena Date: Tue, 9 Jul 2024 08:55:32 -0700 Subject: [PATCH 005/173] SDXL improvements (and support for Draft+) [DRAFT PR] (#9543) * add slurm files to .gitignore * add differentiable decode to SDXL VAE * Optionally return predicted noise during the single step sampling process * also change `get_gamma` as a new function 
to use inside other functions which may interact with sampling (e.g. draft+) * debugging sdunet converter script * Added SD/SDXL conversion script from HF to NeMo * added 'from_nemo' config for VAE * tmp commit, please make changes (oci is super slow, cannot even run vim) * new inference yaml works * add logging to autoencoder * !(dont squash) Added enabling support for LinearWrapper for SDLoRA * added samples_per_batch and fsdp arguments to SDXL inference * added extra optionally wrapper to FSDP * remove unncessary comments * remove unnecessary comments * Apply isort and black reformatting Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Rohit Jena Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 --- .gitignore | 2 + .../stable_diffusion/conf/sd_train.yaml | 1 - .../stable_diffusion/conf/sd_xl_base.yaml | 2 - .../conf/sd_xl_base_train.yaml | 1 - .../stable_diffusion/conf/sd_xl_infer.yaml | 10 +- .../stable_diffusion/conf/sd_xl_infer_v2.yaml | 189 ++++++++ .../stable_diffusion/sd_train.py | 8 +- .../stable_diffusion/sd_xl_infer.py | 44 +- .../stable_diffusion/sd_xl_train.py | 7 +- .../stable_diffusion/diffusion_engine.py | 91 ++-- .../stable_diffusion/ldm/autoencoder.py | 20 +- .../modules/stable_diffusion/attention.py | 4 + .../diffusionmodules/denoiser.py | 9 +- .../diffusionmodules/openaimodel.py | 14 +- .../diffusionmodules/sampling.py | 62 ++- .../diffusionmodules/wrappers.py | 7 +- nemo/collections/multimodal/parts/utils.py | 29 +- .../language_modeling/megatron_base_model.py | 2 + .../nlp/parts/mixins/nlp_adapter_mixins.py | 1 - nemo/collections/nlp/parts/nlp_overrides.py | 6 + nemo/core/classes/mixins/adapter_mixins.py | 8 + .../convert_stablediffusion_hf_to_nemo.py | 452 ++++++++++++++++++ 22 files changed, 880 insertions(+), 89 deletions(-) create mode 100644 examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml create mode 100644 scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py diff --git a/.gitignore b/.gitignore index 1ff2a92cac64..1aa5ef00de5e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.pkl #*.ipynb output +output_2048 result *.pt tests/data/asr @@ -179,3 +180,4 @@ examples/neural_graphs/*.yml .hydra/ nemo_experiments/ +slurm*.out diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml index dff963590864..da03a1de96cf 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml @@ -17,7 +17,6 @@ trainer: enable_model_summary: True limit_val_batches: 0 - exp_manager: exp_dir: null name: ${name} diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base.yaml index c536bae15926..7e83093eb780 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base.yaml @@ -58,8 +58,6 @@ model: lossconfig: target: torch.nn.Identity - - conditioner_config: _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner emb_models: diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml index 7aa765db2e5f..aa1d2782d15b 100644 --- 
a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml @@ -125,7 +125,6 @@ model: target: torch.nn.Identity - conditioner_config: _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner emb_models: diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer.yaml index eb1f6d7ccb8e..632f1634af50 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer.yaml @@ -31,9 +31,9 @@ infer: sampling: base: sampler: EulerEDMSampler - width: 256 - height: 256 - steps: 40 + width: 512 + height: 512 + steps: 50 discretization: "LegacyDDPMDiscretization" guider: "VanillaCFG" thresholder: "None" @@ -48,8 +48,8 @@ sampling: s_noise: 1.0 eta: 1.0 order: 4 - orig_width: 1024 - orig_height: 1024 + orig_width: 512 + orig_height: 512 crop_coords_top: 0 crop_coords_left: 0 aesthetic_score: 5.0 diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml new file mode 100644 index 000000000000..9dc838dcc5c5 --- /dev/null +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml @@ -0,0 +1,189 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: True + limit_val_batches: 0 + + +infer: + num_samples_per_batch: 1 + num_samples: 4 + prompt: + - "A professional photograph of an astronaut riding a pig" + - 'A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat.' + - 'A cute corgi lives in a house made out of sushi.' + - 'A high contrast portrait of a very happy fuzzy panda dressed as a chef in a high end kitchen making dough. There is a painting of flowers on the wall behind him.' + - 'A brain riding a rocketship heading towards the moon.' 
+ negative_prompt: "" + seed: 123 + + +sampling: + base: + sampler: EulerEDMSampler + width: 512 + height: 512 + steps: 50 + discretization: "LegacyDDPMDiscretization" + guider: "VanillaCFG" + thresholder: "None" + scale: 5.0 + img2img_strength: 1.0 + sigma_min: 0.0292 + sigma_max: 14.6146 + rho: 3.0 + s_churn: 0.0 + s_tmin: 0.0 + s_tmax: 999.0 + s_noise: 1.0 + eta: 1.0 + order: 4 + orig_width: 512 + orig_height: 512 + crop_coords_top: 0 + crop_coords_left: 0 + aesthetic_score: 5.0 + negative_aesthetic_score: 5.0 + +# model: +# is_legacy: False + +use_refiner: False +use_fp16: False # use fp16 model weights +out_path: ./output + +base_model_config: /opt/NeMo/examples/multimodal/generative/stable_diffusion/conf/sd_xl_base.yaml +refiner_config: /opt/NeMo/examples/multimodal/generative/stable_diffusion/conf/sd_xl_refiner.yaml + +model: + scale_factor: 0.13025 + disable_first_stage_autocast: True + is_legacy: False + restore_from_path: "" + + fsdp: False + fsdp_set_buffer_dtype: null + fsdp_sharding_strategy: 'full' + use_cpu_initialization: True + # hidden_size: 4 + # pipeline_model_parallel_size: 4 + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.0 + betas: + - 0.9 + - 0.999 + sched: + name: WarmupHoldPolicy + warmup_steps: 10 + hold_steps: 10000000000000 # Incredibly large value to hold the lr as constant + + denoiser_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser + num_idx: 1000 + + weighting_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting + scaling_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization + + unet_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel + from_pretrained: /opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt + from_NeMo: True + adm_in_channels: 2816 + num_classes: sequential + use_checkpoint: False + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4 ] + num_head_channels: 64 + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: [ 1, 2, 10 ] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16 + context_dim: 2048 + image_size: 64 # unused +# spatial_transformer_attn_type: softmax #note: only default softmax is supported now + legacy: False + use_flash_attention: False + + first_stage_config: + # _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper + _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper + from_pretrained: /opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt + from_NeMo: True + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4, 4 ] + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + conditioner_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner + emb_models: + # crossattn cond + - is_trainable: False + input_key: txt + emb_model: + 
_target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder + layer: hidden + layer_idx: 11 + # crossattn and vector cond + - is_trainable: False + input_key: txt + emb_model: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2 + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + freeze: True + layer: penultimate + always_return_pooled: True + legacy: False + # vector cond + - is_trainable: False + input_key: original_size_as_tuple + emb_model: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: crop_coords_top_left + emb_model: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: target_size_as_tuple + emb_model: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND + outdim: 256 # multiplied by two + diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py index 968d9bec2884..7e151699b38c 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py @@ -74,7 +74,11 @@ def main(cfg) -> None: n, c, h = cfg.model.micro_batch_size, cfg.model.channels, cfg.model.image_size x = torch.randn((n, c, h, h), dtype=torch.float32, device="cuda") t = torch.randint(77, (n,), device="cuda") - cc = torch.randn((n, 77, cfg.model.unet_config.context_dim), dtype=torch.float32, device="cuda",) + cc = torch.randn( + (n, 77, cfg.model.unet_config.context_dim), + dtype=torch.float32, + device="cuda", + ) if cfg.model.precision in [16, '16']: x = x.type(torch.float16) cc = cc.type(torch.float16) @@ -93,9 +97,7 @@ def main(cfg) -> None: model.zero_grad() if cfg.model.get('peft', None): - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - if cfg.model.peft.restore_from_path is not None: # initialize peft weights from a checkpoint instead of randomly # This is not the same as resume training because optimizer states are not restored. 
diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py index 8d18be517c69..981e83ec95c4 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py @@ -26,32 +26,44 @@ def model_cfg_modifier(model_cfg): model_cfg.precision = cfg.trainer.precision model_cfg.ckpt_path = None model_cfg.inductor = False - model_cfg.unet_config.from_pretrained = None - model_cfg.first_stage_config.from_pretrained = None + model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt" + model_cfg.unet_config.from_NeMo = True + model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt" + model_cfg.first_stage_config.from_NeMo = True model_cfg.first_stage_config._target_ = 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper' - model_cfg.fsdp = False + # model_cfg.fsdp = True torch.backends.cuda.matmul.allow_tf32 = True trainer, megatron_diffusion_model = setup_trainer_and_model_for_inference( model_provider=MegatronDiffusionEngine, cfg=cfg, model_cfg_modifier=model_cfg_modifier ) + ### Manually configure sharded model + # model = megatron_diffusion_model + # model = trainer.strategy._setup_model(model) + # model = model.cuda(torch.cuda.current_device()) + # get the diffusion part only model = megatron_diffusion_model.model model.cuda().eval() - base = SamplingPipeline(model, use_fp16=cfg.use_fp16, is_legacy=cfg.model.is_legacy) - use_refiner = cfg.get('use_refiner', False) - for i, prompt in enumerate(cfg.infer.prompt): - samples = base.text_to_image( - params=cfg.sampling.base, - prompt=[prompt], - negative_prompt=cfg.infer.negative_prompt, - samples=cfg.infer.num_samples, - return_latents=True if use_refiner else False, - seed=int(cfg.infer.seed + i * 100), - ) - - perform_save_locally(cfg.out_path, samples) + with torch.no_grad(): + base = SamplingPipeline(model, use_fp16=cfg.use_fp16, is_legacy=cfg.model.is_legacy) + use_refiner = cfg.get('use_refiner', False) + num_samples_per_batch = cfg.infer.get('num_samples_per_batch', cfg.infer.num_samples) + num_batches = cfg.infer.num_samples // num_samples_per_batch + + for i, prompt in enumerate(cfg.infer.prompt): + for batchid in range(num_batches): + samples = base.text_to_image( + params=cfg.sampling.base, + prompt=[prompt], + negative_prompt=cfg.infer.negative_prompt, + samples=num_samples_per_batch, + return_latents=True if use_refiner else False, + seed=int(cfg.infer.seed + i * 100 + batchid * 200), + ) + # samples=cfg.infer.num_samples, + perform_save_locally(cfg.out_path, samples) if __name__ == "__main__": diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py index a91beca93761..44412aee0d14 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py @@ -41,7 +41,10 @@ def _training_strategy(self) -> NLPDDPStrategy: _IS_INTERACTIVE = hasattr(sys, "ps1") or bool(sys.flags.interactive) if _IS_INTERACTIVE and self.cfg.trainer.devices == 1: logging.info("Detected interactive environment, using NLPDDPStrategyNotebook") - return NLPDDPStrategyNotebook(no_ddp_communication_hook=True, find_unused_parameters=False,) + return NLPDDPStrategyNotebook( + 
no_ddp_communication_hook=True, + find_unused_parameters=False, + ) if self.cfg.model.get('fsdp', False): assert ( @@ -81,9 +84,7 @@ def main(cfg) -> None: model = MegatronDiffusionEngine(cfg.model, trainer) if cfg.model.get('peft', None): - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - if cfg.model.peft.restore_from_path is not None: # initialize peft weights from a checkpoint instead of randomly # This is not the same as resume training because optimizer states are not restored. diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py index efc1550113a0..755588202ef0 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py @@ -119,7 +119,9 @@ def __init__(self, cfg, model_parallel_config): self._init_first_stage(first_stage_config) self.model_type = None - self.rng = torch.Generator(device=torch.cuda.current_device(),) + self.rng = torch.Generator( + device=torch.cuda.current_device(), + ) self.use_ema = False # TODO use_ema need to switch to NeMo style if self.use_ema: @@ -158,6 +160,13 @@ def decode_first_stage(self, z): out = self.first_stage_model.decode(z) return out + # same as above but differentiable + def differentiable_decode_first_stage(self, z): + z = 1.0 / self.scale_factor * z + with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): + out = self.first_stage_model.decode(z) + return out + @torch.no_grad() def encode_first_stage(self, x): with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): @@ -185,7 +194,12 @@ def training_step(self, batch, batch_idx): self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False) self.log( - "global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False, + "global_step", + self.global_step, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, ) if self.scheduler_config is not None: @@ -231,7 +245,11 @@ def configure_optimizers(self): scheduler = DiffusionEngine.from_config_dict(self.scheduler_config) print("Setting up LambdaLR scheduler...") scheduler = [ - {"scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule), "interval": "step", "frequency": 1,} + { + "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule), + "interval": "step", + "frequency": 1, + } ] return [opt], scheduler return opt @@ -291,7 +309,14 @@ def set_input_tensor(self, input_tensor): pass @torch.no_grad() - def log_images(self, batch: Dict, N: int = 8, sample: bool = True, ucg_keys: List[str] = None, **kwargs,) -> Dict: + def log_images( + self, + batch: Dict, + N: int = 8, + sample: bool = True, + ucg_keys: List[str] = None, + **kwargs, + ) -> Dict: conditioner_input_keys = [e.input_key for e in self.conditioner.embedders] if ucg_keys: assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), ( @@ -305,7 +330,8 @@ def log_images(self, batch: Dict, N: int = 8, sample: bool = True, ucg_keys: Lis x = self.get_input(batch) c, uc = self.conditioner.get_unconditional_conditioning( - batch, force_uc_zero_embeddings=ucg_keys if len(self.conditioner.embedders) > 0 else [], + batch, + force_uc_zero_embeddings=ucg_keys if len(self.conditioner.embedders) > 0 else [], ) sampling_kwargs = {} @@ -400,7 +426,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): # handle asynchronous grad 
reduction no_sync_func = None if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) # pipeline schedules will get these from self.model.config for module in self.get_module_list(): @@ -438,12 +467,12 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): def training_step(self, dataloader_iter): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - Batch should be a list of microbatches and those microbatches should on CPU. - Microbatches are then moved to GPU during the pipeline. - The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + Batch should be a list of microbatches and those microbatches should on CPU. + Microbatches are then moved to GPU during the pipeline. + The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ self._optimizer.zero_grad() @@ -491,20 +520,20 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from apex. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from apex. + No need to call it here. """ pass def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. """ pass def _append_sequence_parallel_module_grads(self, module, grads): - """ Helper method for allreduce_sequence_parallel_gradients""" + """Helper method for allreduce_sequence_parallel_gradients""" for param in module.parameters(): sequence_parallel_param = getattr(param, 'sequence_parallel', False) @@ -517,12 +546,13 @@ def _append_sequence_parallel_module_grads(self, module, grads): def get_forward_output_and_loss_func(self): def process_batch(batch): - """ Prepares the global batch for apex fwd/bwd functions. - Global batch is a list of micro batches. + """Prepares the global batch for apex fwd/bwd functions. + Global batch is a list of micro batches. """ # SD has more dedicated structure for encoding, so we enable autocasting here as well with torch.cuda.amp.autocast( - self.autocast_dtype in (torch.half, torch.bfloat16), dtype=self.autocast_dtype, + self.autocast_dtype in (torch.half, torch.bfloat16), + dtype=self.autocast_dtype, ): if self.model.precache_mode == 'both': x = batch[self.model.input_key].to(torch.cuda.current_device()) @@ -565,7 +595,7 @@ def validation_step(self, dataloader_iter, batch_idx): return loss def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. 
See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: @@ -678,20 +708,23 @@ def setup_test_data(self, cfg): f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = torch.utils.data.DataLoader( - self._test_ds, batch_size=self._micro_batch_size, num_workers=cfg.num_workers, pin_memory=True, + self._test_ds, + batch_size=self._micro_batch_size, + num_workers=cfg.num_workers, + pin_memory=True, ) def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device - When using pipeline parallelism, we need the global batch to remain on the CPU, - since the memory overhead will be too high when using a large number of microbatches. - Microbatches are transferred from CPU to GPU inside the pipeline. + """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device + When using pipeline parallelism, we need the global batch to remain on the CPU, + since the memory overhead will be too high when using a large number of microbatches. + Microbatches are transferred from CPU to GPU inside the pipeline. """ return batch def _validate_trainer(self): - """ Certain trainer configurations can break training. - Here we try to catch them and raise an error. + """Certain trainer configurations can break training. + Here we try to catch them and raise an error. """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py index 6bd47a78fbcf..d79d85c2e026 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py @@ -16,6 +16,7 @@ import pytorch_lightning as pl import torch import torch.nn.functional as F +from nemo.utils import logging try: from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer @@ -316,6 +317,7 @@ def __init__( ignore_keys=[], image_key="image", colorize_nlabels=None, + from_NeMo=False, monitor=None, from_pretrained: str = None, ): @@ -337,6 +339,7 @@ def __init__( self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) if from_pretrained is not None: + logging.info(f"Attempting to load vae weights from {from_pretrained}") if from_pretrained.endswith('safetensors'): from safetensors.torch import load_file as load_safetensors @@ -345,7 +348,7 @@ def __init__( state_dict = torch.load(from_pretrained) if 'state_dict' in state_dict: state_dict = state_dict['state_dict'] - missing_key, unexpected_key, _, _ = self._load_pretrained_model(state_dict) + missing_key, unexpected_key, _, _ = self._load_pretrained_model(state_dict, from_NeMo=from_NeMo) if len(missing_key) > 0: print( f'{self.__class__.__name__}: Following keys are missing during loading VAE weights, which may lead to compromised image quality for a resumed training. Please check the checkpoint you provided.' 
@@ -395,8 +398,9 @@ def _state_key_mapping(self, state_dict: dict): res_dict[key_] = val_ return res_dict - def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False): - state_dict = self._state_key_mapping(state_dict) + def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from_NeMo=False): + if not from_NeMo: + state_dict = self._state_key_mapping(state_dict) model_state_dict = self.state_dict() loaded_keys = [k for k in state_dict.keys()] expected_keys = list(model_state_dict.keys()) @@ -405,7 +409,10 @@ def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False): unexpected_keys = list(set(loaded_keys) - set(expected_keys)) def _find_mismatched_keys( - state_dict, model_state_dict, loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, ): mismatched_keys = [] if ignore_mismatched_sizes: @@ -440,7 +447,10 @@ def _find_mismatched_keys( if state_dict is not None: # Whole checkpoint mismatched_keys = _find_mismatched_keys( - state_dict, model_state_dict, original_loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, ) error_msgs = self._load_state_dict_into_model(state_dict) return missing_keys, unexpected_keys, mismatched_keys, error_msgs diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index 2eeed97db781..e748bcbf93a0 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -227,6 +227,10 @@ def __init__(self, in_features, out_features, bias=True, lora_network_alpha=None def forward(self, x): mixed_x = super().forward(x) if self.is_adapter_available(): + # return this output if lora is not enabled + cfg = self.get_adapter_cfg(AdapterName.PARALLEL_LINEAR_ADAPTER) + if not cfg['enabled']: + return mixed_x lora_linear_adapter = self.get_adapter_module(AdapterName.PARALLEL_LINEAR_ADAPTER) lora_mixed_x = lora_linear_adapter(x) # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. 
diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py index df1f27449bd1..a358bb08f92d 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py @@ -33,13 +33,18 @@ def possibly_quantize_c_noise(self, c_noise): def w(self, sigma): return self.weighting(sigma) - def __call__(self, network, input, sigma, cond): + def __call__(self, network, input, sigma, cond, return_noise=False): sigma = self.possibly_quantize_sigma(sigma) sigma_shape = sigma.shape sigma = append_dims(sigma, input.ndim) c_skip, c_out, c_in, c_noise = self.scaling(sigma) c_noise = self.possibly_quantize_c_noise(c_noise.reshape(sigma_shape)) - return network(input * c_in, c_noise, cond) * c_out + input * c_skip + # predict noise from network + noise_pred = network(input * c_in, c_noise, cond) + denoised = noise_pred * c_out + input * c_skip + if return_noise: + return denoised, noise_pred + return denoised class DiscreteDenoiser(Denoiser): diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 7f8b2fb20bff..eb449c5406b9 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -789,6 +789,7 @@ def __init__( self.input_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch input_block_chans.append(ch) + if level != len(channel_mult) - 1: out_ch = ch self.input_blocks.append( @@ -954,6 +955,7 @@ def __init__( ) if from_pretrained is not None: + logging.info(f"Attempting to load pretrained unet from {from_pretrained}") if from_pretrained.endswith('safetensors'): from safetensors.torch import load_file as load_safetensors @@ -1021,6 +1023,16 @@ def _input_blocks_mapping(self, input_dict): .replace('conv2', 'out_layers.3') .replace('conv_shortcut', 'skip_connection') ) + ## Rohit: I've changed this to make sure it is compatible + # post_fix = ( + # key_[25:] + # .replace('time_emb_proj', 'emb_layers.1') + # .replace('norm1', 'in_layers.0') + # .replace('norm2', 'out_layers.0') + # .replace('conv1', 'in_layers.1') + # .replace('conv2', 'out_layers.2') + # .replace('conv_shortcut', 'skip_connection') + # ) res_dict["input_blocks." + str(target_id) + '.0.' 
+ post_fix] = value_ elif "attentions" in key_: id_1 = int(key_[26]) @@ -1168,7 +1180,7 @@ def te_fp8_key_mapping(self, unet_dict): return new_state_dict def _state_key_mapping(self, state_dict: dict): - + # state_dict is a HF model res_dict = {} input_dict = {} mid_dict = {} diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/sampling.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/sampling.py index c636ffec345d..bfae8790eeb2 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/sampling.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/sampling.py @@ -47,7 +47,12 @@ def __init__( ): self.num_steps = num_steps self.discretization = instantiate_from_config(discretization_config) - self.guider = instantiate_from_config(default(guider_config, DEFAULT_GUIDER,)) + self.guider = instantiate_from_config( + default( + guider_config, + DEFAULT_GUIDER, + ) + ) self.verbose = verbose self.device = device @@ -93,35 +98,50 @@ def euler_step(self, x, d, dt): class EDMSampler(SingleStepDiffusionSampler): def __init__(self, s_churn=0.0, s_tmin=0.0, s_tmax=float("inf"), s_noise=1.0, *args, **kwargs): super().__init__(*args, **kwargs) - self.s_churn = s_churn self.s_tmin = s_tmin self.s_tmax = s_tmax self.s_noise = s_noise - def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, gamma=0.0): + def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, gamma=0.0, return_noise=False): + # x is actually \bar{x} as in the DDIM paper sigma_hat = sigma * (gamma + 1.0) if gamma > 0: eps = torch.randn_like(x) * self.s_noise - x = x + eps * append_dims(sigma_hat ** 2 - sigma ** 2, x.ndim) ** 0.5 + x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5 denoised = self.denoise(x, denoiser, sigma_hat, cond, uc) + # this is the noise (e_t) d = to_d(x, sigma_hat, denoised) dt = append_dims(next_sigma - sigma_hat, x.ndim) - euler_step = self.euler_step(x, d, dt) + euler_step = self.euler_step(x, d, dt) # this is x_{t-\delta{t}} x = self.possible_correction_step(euler_step, x, d, dt, next_sigma, denoiser, cond, uc) + if return_noise: + return x, d return x + def get_gamma(self, sigmas, num_sigmas, index): + gamma = ( + min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1) if self.s_tmin <= sigmas[index] <= self.s_tmax else 0.0 + ) + return gamma + def __call__(self, denoiser, x, cond, uc=None, num_steps=None): + # prepare_sampling_loop converts x into \bar{x} = x / \sqrt{\tilde{\alpha_t}} x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(x, cond, uc, num_steps) for i in self.get_sigma_gen(num_sigmas): - gamma = ( - min(self.s_churn / (num_sigmas - 1), 2 ** 0.5 - 1) if self.s_tmin <= sigmas[i] <= self.s_tmax else 0.0 + gamma = self.get_gamma(sigmas, num_sigmas, i) + x = self.sampler_step( + s_in * sigmas[i], + s_in * sigmas[i + 1], + denoiser, + x, + cond, + uc, + gamma, ) - x = self.sampler_step(s_in * sigmas[i], s_in * sigmas[i + 1], denoiser, x, cond, uc, gamma,) - return x @@ -151,14 +171,24 @@ def __call__(self, denoiser, x, cond, uc=None, num_steps=None): x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(x, cond, uc, num_steps) for i in self.get_sigma_gen(num_sigmas): - x = self.sampler_step(s_in * sigmas[i], s_in * sigmas[i + 1], denoiser, x, cond, uc,) + x = self.sampler_step( + s_in * sigmas[i], + s_in * sigmas[i + 1], + denoiser, + x, + cond, + uc, + ) return x class LinearMultistepSampler(BaseDiffusionSampler): def __init__( - self, 
order=4, *args, **kwargs, + self, + order=4, + *args, + **kwargs, ): super().__init__(*args, **kwargs) @@ -276,7 +306,15 @@ def get_mult(self, h, r, t, t_next, previous_sigma): return mult1, mult2 def sampler_step( - self, old_denoised, previous_sigma, sigma, next_sigma, denoiser, x, cond, uc=None, + self, + old_denoised, + previous_sigma, + sigma, + next_sigma, + denoiser, + x, + cond, + uc=None, ): denoised = self.denoise(x, denoiser, sigma, cond, uc) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/wrappers.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/wrappers.py index 0d465c1275c6..24e2124e6f83 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/wrappers.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/wrappers.py @@ -37,6 +37,11 @@ class OpenAIWrapper(IdentityWrapper): def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs) -> torch.Tensor: if c.get("concat", None): x = torch.cat((x, c.get("concat")), dim=1) + return self.diffusion_model( - x, timesteps=t, context=c.get("crossattn", None), y=c.get("vector", None), **kwargs, + x, + timesteps=t, + context=c.get("crossattn", None), + y=c.get("vector", None), + **kwargs, ) diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 7eb72b38d0f0..5a01e8702a9e 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -23,11 +23,11 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment from transformers import CLIPImageProcessor, SiglipImageProcessor -from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform +from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform from nemo.collections.multimodal.data.neva.neva_dataset import process_image from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPFSDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import AppState, logging @@ -276,10 +276,23 @@ def setup_trainer_and_model_for_inference( # Use the NLPDDPStrategy for the distributed data parallel strategy. # We don't use DDP for async grad allreduce and don't find unused parameters. - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - find_unused_parameters=False, - ) + if not cfg.model.get('fsdp', False): + logging.info("FSDP is False, using DDP strategy.") + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + find_unused_parameters=False, + ) + else: + logging.info("Using FSDP strategy.") + strategy = NLPFSDPStrategy( + limit_all_gathers=cfg.model.get('fsdp_limit_all_gathers', True), + sharding_strategy=cfg.model.get('fsdp_sharding_strategy', 'full'), + cpu_offload=cfg.model.get('fsdp_cpu_offload', True), + grad_reduce_dtype=cfg.model.get('fsdp_grad_reduce_dtype', 32), + precision=cfg.trainer.precision, + # use_orig_params=cfg.model.inductor, + set_buffer_dtype=cfg.get('fsdp_set_buffer_dtype', None), + ) # Set up the trainer with the specified plugins and strategy. 
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) @@ -323,7 +336,9 @@ def setup_trainer_and_model_for_inference( ) else: - raise ValueError(f"Unrecognized checkpoint type: {cfg.model.restore_from_path}") + # load a model from scratch + logging.warning("Loading a model from scratch for inference. Tread carefully.") + model = model_provider(cfg=cfg.model, trainer=trainer) # initialize apex DDP strategy def dummy(): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 4ded9a42db4f..e1641a81c0dc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -1271,6 +1271,8 @@ def find_frozen_submodules(model): # TODO: Currently the main parameter data type is kept in fp32 (when O2=False). This needs to be # extended to support lower precision main parameters. frozen_submodule_names, frozen_submodules = find_frozen_submodules(self.model) + for submodule in frozen_submodule_names: + logging.debug(f"Ignoring state {submodule} in FSDP.") self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules # FSDP requires uniform status of require_grads # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' from sharding for FSDP to work diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 45f4af3cfbf3..2bacaf52e3f8 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -161,7 +161,6 @@ def _get_layers_from_model(self, model): def _check_and_add_peft_cfg(self, peft_cfg): layer_selection = peft_cfg.layer_selection - assert not self.use_mcore_gpt or hasattr( peft_cfg, 'name_key_to_mcore_mixins' ), f"{peft_cfg.__class__.__name__} is not supported in megatron core mode yet." 
diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e251690831cb..b003e310baeb 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -701,6 +701,7 @@ def __init__( nccl_communicator_config_path: Optional[str] = None, sharp: bool = False, set_buffer_dtype: Optional[str] = None, + extra_fsdp_wrap_module: Optional[set] = None, **kwargs: Union[Any, Dict[str, Any]], ) -> None: if not HAVE_APEX: @@ -730,6 +731,11 @@ def __init__( ParallelTransformerLayer, BasicTransformerBlock, } + + # if extra wrap modules are provided, use them + if extra_fsdp_wrap_module is not None: + self.fsdp_wrap_module.update(extra_fsdp_wrap_module) + kwargs['auto_wrap_policy'] = functools.partial( transformer_auto_wrap_policy, transformer_layer_cls=self.fsdp_wrap_module ) diff --git a/nemo/core/classes/mixins/adapter_mixins.py b/nemo/core/classes/mixins/adapter_mixins.py index 05ac9b429d85..7b5d02c86bf7 100644 --- a/nemo/core/classes/mixins/adapter_mixins.py +++ b/nemo/core/classes/mixins/adapter_mixins.py @@ -391,6 +391,14 @@ def get_adapter_module(self, name: str): return self.adapter_layer[name] if name in self.adapter_layer else None return None + def get_adapter_cfg(self, name: str): + """Same logic as `get_adapter_module` but to get the config""" + _, name = self.resolve_adapter_module_name_(name) + + if hasattr(self, "adapter_cfg"): + return self.adapter_cfg[name] if name in self.adapter_cfg else None + return None + def set_accepted_adapter_types(self, adapter_types: List[Union[type, str]]) -> None: """ The module with this mixin can define a list of adapter names that it will accept. diff --git a/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py b/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py new file mode 100644 index 000000000000..67bc975708d0 --- /dev/null +++ b/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py @@ -0,0 +1,452 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert HuggingFace Starcoder2 checkpoints into nemo checkpoint. 
+ Example to run this conversion script: + python convert_hf_starcoder2_to_nemo.py \ + --input_name_or_path \ + --output_path +""" + +from argparse import ArgumentParser + +import numpy as np +import safetensors +import torch +import torch.nn + +from nemo.utils import logging + +intkey = lambda x: int(x) + + +def filter_keys(rule, dict): + keys = list(dict.keys()) + nd = {k: dict[k] for k in keys if rule(k)} + return nd + + +def map_keys(rule, dict): + new = {rule(k): v for k, v in dict.items()} + return new + + +def split_name(name, dots=0): + l = name.split(".") + return ".".join(l[: dots + 1]), ".".join(l[dots + 1 :]) + + +def is_prefix(shortstr, longstr): + # is the first string a prefix of the second one + return longstr == shortstr or longstr.startswith(shortstr + ".") + + +def numdots(str): + return str.count(".") + + +class SegTree: + def __init__(self): + self.nodes = dict() + self.val = None + self.final_val = 0 + self.convert_name = None + + def __len__(self): + return len(self.nodes) + + def is_leaf(self): + return len(self.nodes) == 0 + + def add(self, name, val=0): + prefix, subname = split_name(name) + if subname == '': + self.nodes[name] = SegTree() + self.nodes[name].val = val + return + if self.nodes.get(prefix) is None: + self.nodes[prefix] = SegTree() + self.nodes[prefix].add(subname, val) + + def change(self, name, val): + self.add(name, val) + + def __getitem__(self, name: str): + if hasattr(self, name): + return getattr(self, name) + val = self.nodes.get(name) + if val is None: + # straight lookup failed, do a prefix lookup + keys = list(self.nodes.keys()) + p_flag = [is_prefix(k, name) for k in keys] + if not any(p_flag): + return None + # either more than 1 match (error) or exactly 1 (success) + if np.sum(p_flag) > 1: + print(f"error: multiple matches of key {name} with {keys}") + else: + i = np.where(p_flag)[0][0] + n = numdots(keys[i]) + prefix, substr = split_name(name, n) + return self.nodes[prefix][substr] + return val + + +def model_to_tree(model): + keys = list(model.keys()) + tree = SegTree() + for k in keys: + tree.add(k, "leaf") + return tree + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface UNet checkpoints", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--model", type=str, default="unet", required=True, choices=['unet', 'vae']) + parser.add_argument("--debug", action='store_true', help="Useful for debugging purposes.") + + args = parser.parse_args() + return args + + +def make_tiny_config(config): + '''dial down the config file to make things tractable''' + # TODO + return config + + +def load_hf_ckpt(in_dir, args): + ckpt = {} + with safetensors.safe_open(in_dir + "/diffusion_pytorch_model.safetensors", framework="pt") as f: + for k in f.keys(): + ckpt[k] = f.get_tensor(k) + return args, ckpt + + +def dup_convert_name_recursive(tree: SegTree, convert_name=None): + '''inside this tree, convert all nodes recursively + optionally, convert the name of the root as given by name (if not None) + ''' + if tree is None: + return + if convert_name is not None: + tree.convert_name = convert_name + # recursively copy the name into convert_name + for k, v in tree.nodes.items(): + dup_convert_name_recursive(v, k) + + +def sanity_check(hf_tree, hf_unet, nemo_unet): + # 
check if i'm introducing new keys + for hfk, nk in hf_to_nemo_mapping(hf_tree).items(): + if nk not in nemo_unet.keys(): + print(nk) + if hfk not in hf_unet.keys(): + print(hfk) + + +def convert_input_keys(hf_tree: SegTree): + '''map the input blocks of huggingface model''' + # map `conv_in` to first input block + dup_convert_name_recursive(hf_tree['conv_in'], 'input_blocks.0.0') + + # start counting blocks from now on + nemo_inp_blk = 1 + down_blocks = hf_tree['down_blocks'] + down_blocks_keys = sorted(list(down_blocks.nodes.keys()), key=intkey) + for downblockid in down_blocks_keys: + block = down_blocks[str(downblockid)] + # compute number of resnets, attentions, downsamplers in this block + resnets = block.nodes.get('resnets', SegTree()) + attentions = block.nodes.get('attentions', SegTree()) + downsamplers = block.nodes.get('downsamplers', SegTree()) + + if len(attentions) == 0: # no attentions, this is a DownBlock2d + for resid in sorted(list(resnets.nodes.keys()), key=intkey): + resid = str(resid) + resnets[resid].convert_name = f"input_blocks.{nemo_inp_blk}.0" + map_resnet_block(resnets[resid]) + nemo_inp_blk += 1 + elif len(attentions) == len(resnets): + # there are attention blocks here -- each resnet+attention becomes a block + for resid in sorted(list(resnets.nodes.keys()), key=intkey): + resid = str(resid) + resnets[resid].convert_name = f"input_blocks.{nemo_inp_blk}.0" + map_resnet_block(resnets[resid]) + attentions[resid].convert_name = f"input_blocks.{nemo_inp_blk}.1" + map_attention_block(attentions[resid]) + nemo_inp_blk += 1 + else: + logging.warning("number of attention blocks is not the same as resnets - whats going on?") + + # if there is a downsampler, then also append it + if len(downsamplers) > 0: + for k in downsamplers.nodes.keys(): + downsamplers[k].convert_name = f"input_blocks.{nemo_inp_blk}.{k}" + dup_convert_name_recursive(downsamplers[k]['conv'], 'op') + nemo_inp_blk += 1 + + +def clean_convert_names(tree): + tree.convert_name = None + for k, v in tree.nodes.items(): + clean_convert_names(v) + + +def map_attention_block(att_tree: SegTree): + '''this HF tree can either be an AttentionBlock or a DualAttention block + currently assumed AttentionBlock + + ''' + + # TODO (rohit): Add check for dual attention block + def check_att_type(tree): + return "att_block" + + if check_att_type(att_tree) == 'att_block': + dup_convert_name_recursive(att_tree['norm'], 'norm') + dup_convert_name_recursive(att_tree['proj_in'], 'proj_in') + dup_convert_name_recursive(att_tree['proj_out'], 'proj_out') + tblockids = list(att_tree['transformer_blocks'].nodes.keys()) + for t in tblockids: + tblock = att_tree[f'transformer_blocks.{t}'] + tblock.convert_name = f"transformer_blocks.{t}" + dup_convert_name_recursive(tblock['attn1'], 'attn1') + dup_convert_name_recursive(tblock['attn2'], 'attn2') + dup_convert_name_recursive(tblock['norm1'], 'attn1.norm') + dup_convert_name_recursive(tblock['norm2'], 'attn2.norm') + dup_convert_name_recursive(tblock['norm3'], 'ff.net.0') + # map ff module + tblock['ff'].convert_name = "ff" + tblock['ff.net'].convert_name = 'net' + dup_convert_name_recursive(tblock['ff.net.0'], '1') + dup_convert_name_recursive(tblock['ff.net.2'], '3') + else: + logging.warning("failed to identify type of attention block here.") + + +def map_resnet_block(resnet_tree: SegTree): + '''this HF tree is supposed to have all the keys for a resnet''' + dup_convert_name_recursive(resnet_tree.nodes.get('time_emb_proj'), 'emb_layers.1') + 
dup_convert_name_recursive(resnet_tree['norm1'], 'in_layers.0') + dup_convert_name_recursive(resnet_tree['conv1'], 'in_layers.1') + dup_convert_name_recursive(resnet_tree['norm2'], 'out_layers.0') + dup_convert_name_recursive(resnet_tree['conv2'], 'out_layers.2') + dup_convert_name_recursive(resnet_tree.nodes.get('conv_shortcut'), 'skip_connection') + + +def hf_to_nemo_mapping(tree: SegTree): + mapping = {} + for nodename, subtree in tree.nodes.items(): + convert_name = subtree.convert_name + convert_name = (convert_name + ".") if convert_name is not None else "" + if subtree.is_leaf() and subtree.convert_name is not None: + mapping[nodename] = subtree.convert_name + else: + submapping = hf_to_nemo_mapping(subtree) + for k, v in submapping.items(): + mapping[nodename + "." + k] = convert_name + v + return mapping + + +def convert_cond_keys(tree: SegTree): + # map all conditioning keys + tree['add_embedding'].convert_name = 'label_emb.0' + dup_convert_name_recursive(tree['add_embedding.linear_1'], '0') + dup_convert_name_recursive(tree['add_embedding.linear_2'], '2') + tree['time_embedding'].convert_name = 'time_embed' + dup_convert_name_recursive(tree['time_embedding.linear_1'], '0') + dup_convert_name_recursive(tree['time_embedding.linear_2'], '2') + + +def convert_middle_keys(tree: SegTree): + '''middle block is fixed (resnet -> attention -> resnet)''' + mid = tree['mid_block'] + resnets = mid['resnets'] + attns = mid['attentions'] + mid.convert_name = 'middle_block' + resnets['0'].convert_name = '0' + resnets['1'].convert_name = '2' + attns['0'].convert_name = '1' + map_resnet_block(resnets['0']) + map_resnet_block(resnets['1']) + map_attention_block(attns['0']) + + +def convert_output_keys(hf_tree: SegTree): + '''output keys is similar to input keys''' + nemo_inp_blk = 0 + up_blocks = hf_tree['up_blocks'] + up_blocks_keys = sorted(list(up_blocks.nodes.keys()), key=intkey) + + for downblockid in up_blocks_keys: + block = up_blocks[str(downblockid)] + # compute number of resnets, attentions, downsamplers in this block + resnets = block.nodes.get('resnets', SegTree()) + attentions = block.nodes.get('attentions', SegTree()) + upsamplers = block.nodes.get('upsamplers', SegTree()) + + if len(attentions) == 0: # no attentions, this is a DownBlock2d + for resid in sorted(list(resnets.nodes.keys()), key=intkey): + resid = str(resid) + resnets[resid].convert_name = f"output_blocks.{nemo_inp_blk}.0" + map_resnet_block(resnets[resid]) + nemo_inp_blk += 1 + + elif len(attentions) == len(resnets): + # there are attention blocks here -- each resnet+attention becomes a block + for resid in sorted(list(resnets.nodes.keys()), key=intkey): + resid = str(resid) + resnets[resid].convert_name = f"output_blocks.{nemo_inp_blk}.0" + map_resnet_block(resnets[resid]) + attentions[resid].convert_name = f"output_blocks.{nemo_inp_blk}.1" + map_attention_block(attentions[resid]) + nemo_inp_blk += 1 + else: + logging.warning("number of attention blocks is not the same as resnets - whats going on?") + + # if there is a downsampler, then also append it + if len(upsamplers) > 0: + # for k in upsamplers.nodes.keys(): + nemo_inp_blk -= 1 + upsamplers['0'].convert_name = f"output_blocks.{nemo_inp_blk}.2" + dup_convert_name_recursive(upsamplers['0.conv'], 'conv') + nemo_inp_blk += 1 + + +def convert_finalout_keys(hf_tree: SegTree): + dup_convert_name_recursive(hf_tree['conv_norm_out'], "out.0") + dup_convert_name_recursive(hf_tree['conv_out'], "out.1") + + +def convert_encoder(hf_tree: SegTree): + encoder = 
hf_tree['encoder'] + encoder.convert_name = 'encoder' + dup_convert_name_recursive(encoder['conv_in'], 'conv_in') + dup_convert_name_recursive(encoder['conv_out'], 'conv_out') + dup_convert_name_recursive(encoder['conv_norm_out'], 'norm_out') + + # each block contains resnets and downsamplers + # there are also optional attention blocks in the down module, but I havent encountered them yet + encoder['down_blocks'].convert_name = 'down' + for downid, downblock in encoder['down_blocks'].nodes.items(): + downblock.convert_name = downid + downsamplers = downblock.nodes.get('downsamplers', SegTree()) + dup_convert_name_recursive(downblock['resnets'], 'block') + # check for conv_shortcuts here + for resid, resnet in downblock['resnets'].nodes.items(): + if resnet.nodes.get('conv_shortcut') is not None: + resnet.nodes['conv_shortcut'].convert_name = 'nin_shortcut' + if len(downsamplers) > 0: + dup_convert_name_recursive(downsamplers['0'], 'downsample') + + # map the `mid_block` ( NeMo's mid layer is hardcoded in terms of number of modules) + encoder['mid_block'].convert_name = 'mid' + dup_convert_name_recursive(encoder[f'mid_block.resnets.0'], 'block_1') + dup_convert_name_recursive(encoder[f'mid_block.resnets.1'], 'block_2') + + # attention part + att = encoder['mid_block.attentions.0'] + att.convert_name = 'attn_1' + dup_convert_name_recursive(att['group_norm'], 'norm') + dup_convert_name_recursive(att['to_k'], 'k') + dup_convert_name_recursive(att['to_q'], 'q') + dup_convert_name_recursive(att['to_v'], 'v') + dup_convert_name_recursive(att['to_out.0'], 'proj_out') + + +def convert_decoder(hf_tree: SegTree): + decoder = hf_tree['decoder'] + decoder.convert_name = 'decoder' + dup_convert_name_recursive(decoder['conv_in'], 'conv_in') + dup_convert_name_recursive(decoder['conv_out'], 'conv_out') + dup_convert_name_recursive(decoder['conv_norm_out'], 'norm_out') + # each block contains resnets and downsamplers + # map the `mid_block` ( NeMo's mid layer is hardcoded in terms of number of modules) + decoder['mid_block'].convert_name = 'mid' + dup_convert_name_recursive(decoder[f'mid_block.resnets.0'], 'block_1') + dup_convert_name_recursive(decoder[f'mid_block.resnets.1'], 'block_2') + att = decoder['mid_block.attentions.0'] + att.convert_name = 'attn_1' + dup_convert_name_recursive(att['group_norm'], 'norm') + dup_convert_name_recursive(att['to_k'], 'k') + dup_convert_name_recursive(att['to_q'], 'q') + dup_convert_name_recursive(att['to_v'], 'v') + dup_convert_name_recursive(att['to_out.0'], 'proj_out') + + # up blocks contain resnets and upsamplers + decoder['up_blocks'].convert_name = 'up' + num_up_blocks = len(decoder['up_blocks']) + for upid, upblock in decoder['up_blocks'].nodes.items(): + upblock.convert_name = str(num_up_blocks - 1 - int(upid)) + upsamplers = upblock.nodes.get('upsamplers', SegTree()) + dup_convert_name_recursive(upblock['resnets'], 'block') + # check for conv_shortcuts here + for resid, resnet in upblock['resnets'].nodes.items(): + if resnet.nodes.get('conv_shortcut') is not None: + resnet.nodes['conv_shortcut'].convert_name = 'nin_shortcut' + if len(upsamplers) > 0: + dup_convert_name_recursive(upsamplers['0'], 'upsample') + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + _, hf_ckpt = load_hf_ckpt(args.input_name_or_path, args) + hf_tree = model_to_tree(hf_ckpt) + + if args.model == 'unet': + logging.info("converting unet...") + convert_input_keys(hf_tree) + convert_cond_keys(hf_tree) + convert_middle_keys(hf_tree) + 
convert_output_keys(hf_tree) + convert_finalout_keys(hf_tree) + # get mapping + + elif args.model == 'vae': + logging.info("converting vae...") + dup_convert_name_recursive(hf_tree['quant_conv'], 'quant_conv') + dup_convert_name_recursive(hf_tree['post_quant_conv'], 'post_quant_conv') + convert_encoder(hf_tree) + convert_decoder(hf_tree) + + else: + logging.error("incorrect model specification.") + return + + # check mapping + mapping = hf_to_nemo_mapping(hf_tree) + if len(mapping) != len(hf_ckpt.keys()): + logging.warning("not all keys are matched properly.") + nemo_ckpt = {} + + for hf_key, nemo_key in mapping.items(): + nemo_ckpt[nemo_key] = hf_ckpt[hf_key] + torch.save(nemo_ckpt, args.output_path) + logging.info(f"Saved nemo file to {args.output_path}") + + +if __name__ == '__main__': + args = get_args() + convert(args) From 8898b761e1bf21cf61b4fdd6dde4f0a37a20d060 Mon Sep 17 00:00:00 2001 From: Justin Kim Date: Tue, 9 Jul 2024 10:30:17 -0700 Subject: [PATCH 006/173] Triton deployment improvements for in-framework models (#9600) * add NemoQueryLLMPyTorch class for triton query of in-framework models * nemo_export.py changes to better support in-framework models * separate out in-framework version of triton deploy script * add generate() function to MegatronLLMDeployable to allow for direct use in export tests * use NemoQueryLLMPyTorch in deploy tests * add warning message for when MegatronLLMDeployable overrides transformer_engine * remove enable_streaming argument from deploy_inframework_triton.py since MegatronLLMDeployable does not support streaming add query_inframework.py since original query.py does not work with in-framework deployments * Apply isort and black reformatting Signed-off-by: jukim-nv * skip trtllm support check if in_framework testing * remove unused imports * run_existing_checkpoints was passing wrong prompts argument for in-framework mode * fix unused import in query_inframework.py --------- Signed-off-by: jukim-nv Co-authored-by: jukim-nv Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> --- nemo/deploy/nlp/__init__.py | 2 +- nemo/deploy/nlp/megatronllm_deployable.py | 20 ++- nemo/deploy/nlp/query_llm.py | 100 +++++++++++-- .../deploy/nlp/deploy_inframework_triton.py | 103 ++++++++++++++ scripts/deploy/nlp/query_inframework.py | 83 +++++++++++ tests/deploy/nemo_deploy.py | 4 +- tests/export/nemo_export.py | 134 +++++++++++------- 7 files changed, 376 insertions(+), 70 deletions(-) create mode 100755 scripts/deploy/nlp/deploy_inframework_triton.py create mode 100644 scripts/deploy/nlp/query_inframework.py diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index a2110931c6df..5ebbe6816664 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -15,7 +15,7 @@ use_query_llm = True try: - from nemo.deploy.nlp.query_llm import NemoQueryLLM + from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch except Exception: use_query_llm = False diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py index c27bbbd0102b..1fe029f9fade 100644 --- a/nemo/deploy/nlp/megatronllm_deployable.py +++ b/nemo/deploy/nlp/megatronllm_deployable.py @@ -15,6 +15,7 @@ import logging from enum import IntEnum, auto from pathlib import Path +from typing import List import numpy as np import torch @@ -129,6 +130,12 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices: nemo_checkpoint_filepath, trainer=trainer, return_config=True ) # 
transformer_engine should always be true according to EricH, but GPT-2B model will fail if it is enabled + if not custom_config.transformer_engine: + LOGGER.warning( + "MegatronLLMDeployable expects model config transformer_engine=True, but this model has it =False. " + "Overriding it to =True, but this may break certain checkpoints converted on older Nemo versions. " + "If your model breaks, please try re-converting the checkpoint on the current Nemo version." + ) custom_config.transformer_engine = True # using multi-gpu for tensor parallelism directly for now, could do pipeline parallel instead or a combination custom_config.tensor_model_parallel_size = num_devices @@ -233,9 +240,7 @@ def _length_params_from_triton_inputs(**inputs: np.ndarray): length_params[length_param_field] = inputs.pop(length_param_field)[0][0] return length_params - @batch - def triton_infer_fn(self, **inputs: np.ndarray): - """Triton server inference function that actually runs the model""" + def generate(self, inputs: List[str], length_params: LengthParam, sampling_params: SamplingParam): if torch.distributed.is_initialized(): distributed_rank = torch.distributed.get_rank() if distributed_rank != 0: @@ -245,13 +250,16 @@ def triton_infer_fn(self, **inputs: np.ndarray): signal_value = ServerSync.SIGNAL.to_long_tensor() torch.distributed.broadcast(signal_value, 0) + return self.model.generate(inputs=inputs, length_params=length_params, sampling_params=sampling_params) + + @batch + def triton_infer_fn(self, **inputs: np.ndarray): + """Triton server inference function that actually runs the model""" input_strings = str_ndarray2list(inputs.pop("prompts")) sampling_params = self._sampling_params_from_triton_inputs(**inputs) length_params = self._length_params_from_triton_inputs(**inputs) - model_output = self.model.generate( - inputs=input_strings, length_params=length_params, sampling_params=sampling_params - ) + model_output = self.generate(input_strings, length_params, sampling_params) ''' model_output['sentences'] will be a list of strings (one per prompt) other fields will either be a list of lists (tokens, for example) diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 940a927c7a54..71492520bf0a 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -30,23 +30,99 @@ def __init__(self, url, model_name): self.url = url self.model_name = model_name - @abstractmethod + +class NemoQueryLLMPyTorch(NemoQueryLLMBase): + """ + Sends a query to Triton for LLM inference + + Example: + from nemo.deploy import NemoTritonQueryLLMPyTorch + + nq = NemoTritonQueryLLMPyTorch(url="localhost", model_name="GPT-2B") + + prompts = ["hello, testing GPT inference", "another GPT inference test?"] + output = nq.query_llm( + prompts=prompts, + max_length=100, + top_k=1, + top_p=0.0, + temperature=0.0, + ) + print("prompts: ", prompts) + """ + + def __init__(self, url, model_name): + super().__init__( + url=url, + model_name=model_name, + ) + + # these arguments are explicitly defined in order to make it clear to user what they can pass + # names and optionality should exactly match the get_triton_input() results for MegatronGPTDeployable def query_llm( self, prompts, - stop_words_list=None, - bad_words_list=None, - no_repeat_ngram_size=None, - max_output_len=512, - top_k=1, - top_p=0.0, - temperature=1.0, - random_seed=None, - task_id=None, - lora_uids=None, + use_greedy: bool = None, + temperature: float = None, + top_k: int = None, + top_p: float = None, + repetition_penalty: float = 
None, + add_BOS: bool = None, + all_probs: bool = None, + compute_logprob: bool = None, + end_strings=None, + min_length: int = None, + max_length: int = None, init_timeout=60.0, ): - pass + """ + Query the Triton server synchronously and return a list of responses. + + Args: + prompts (List(str)): list of sentences. + use_greedy (bool): use greedy sampling, effectively the same as top_k=1 + temperature (float): A parameter of the softmax function, which is the last layer in the network. + top_k (int): limits us to a certain number (K) of the top tokens to consider. + top_p (float): limits us to the top tokens within a certain probability mass (p). + repetition_penalty (float): penalty applied to repeated sequences, 1.0 means no penalty. + add_BOS (bool): whether or not to add a BOS (beginning of sentence) token. + all_probs (bool): when using compute_logprob, returns probabilities for all tokens in vocabulary. + compute_logprob (bool): get back probabilities of all tokens in the sequence. + end_strings (List(str)): list of strings which will terminate generation when they appear in the output. + min_length (int): min generated tokens. + max_length (int): max generated tokens. + init_timeout (flat): timeout for the connection. + """ + prompts = str_list2numpy(prompts) + inputs = { + "prompts": prompts, + } + if use_greedy is not None: + inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_) + if temperature is not None: + inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) + if top_k is not None: + inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) + if top_p is not None: + inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) + if repetition_penalty is not None: + inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) + if add_BOS is not None: + inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) + if all_probs is not None: + inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) + if compute_logprob is not None: + inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_) + if end_strings is not None: + inputs["end_strings"] = str_list2numpy(end_strings) + if min_length is not None: + inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_) + if max_length is not None: + inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_) + + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch(**inputs) + return result_dict class NemoQueryLLM(NemoQueryLLMBase): diff --git a/scripts/deploy/nlp/deploy_inframework_triton.py b/scripts/deploy/nlp/deploy_inframework_triton.py new file mode 100755 index 000000000000..b698e4cbacfd --- /dev/null +++ b/scripts/deploy/nlp/deploy_inframework_triton.py @@ -0,0 +1,103 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
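For context, a minimal client-side sketch of how the new NemoQueryLLMPyTorch class is meant to be used once an in-framework model is serving on Triton (for example via the deploy_inframework_triton.py script introduced below). The URL, model name, and prompt are illustrative assumptions, not values taken from this patch:

    # Minimal usage sketch; assumes a Triton server is already hosting an in-framework
    # NeMo model under the (hypothetical) name "GPT-2B" at localhost:8000.
    from nemo.deploy.nlp import NemoQueryLLMPyTorch

    nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name="GPT-2B")
    result = nq.query_llm(
        prompts=["hello, testing GPT inference"],
        max_length=100,      # forwarded as the "max_length" Triton input
        top_k=1,
        top_p=0.0,
        temperature=1.0,
        init_timeout=60.0,
    )
    # query_llm returns the raw PyTriton batch result; the generated text (prompt plus
    # completion, since MegatronLLMDeployable echoes the prompt) is under "sentences".
    print(result["sentences"])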
+ +import argparse +import logging +import sys + +from nemo.deploy import DeployPyTriton + +LOGGER = logging.getLogger("NeMo") + +megatron_llm_supported = True +try: + from nemo.deploy.nlp import MegatronLLMDeployable +except Exception as e: + LOGGER.warning(f"Cannot import MegatronLLMDeployable, it will not be available. {type(e).__name__}: {e}") + megatron_llm_supported = False + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton", + ) + parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service") + parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service") + parser.add_argument( + "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests" + ) + parser.add_argument( + "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server" + ) + parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment") + parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") + args = parser.parse_args(argv) + return args + + +def get_nemo_deployable(args): + if args.nemo_checkpoint is None: + raise ValueError("In-Framework deployment requires a .nemo checkpoint") + + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + if not megatron_llm_supported: + raise ValueError("MegatronLLMDeployable is not supported in this environment.") + triton_deployable = get_nemo_deployable(args) + + try: + nm = DeployPyTriton( + model=triton_deployable, + triton_model_name=args.triton_model_name, + triton_model_version=args.triton_model_version, + max_batch_size=args.max_batch_size, + port=args.triton_port, + address=args.triton_http_address, + ) + + LOGGER.info("Triton deploy function will be called.") + nm.deploy() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + try: + LOGGER.info("Model serving on Triton is will be started.") + nm.serve() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + LOGGER.info("Model serving will be stopped.") + nm.stop() + + +if __name__ == '__main__': + nemo_deploy(sys.argv[1:]) diff --git a/scripts/deploy/nlp/query_inframework.py b/scripts/deploy/nlp/query_inframework.py new file mode 100644 index 000000000000..e77ab72a1f04 --- /dev/null +++ b/scripts/deploy/nlp/query_inframework.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys + +from nemo.deploy.nlp.query_llm import NemoQueryLLMPyTorch + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Queries Triton server running an in-framework Nemo model", + ) + parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server") + parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model") + prompt_group = parser.add_mutually_exclusive_group(required=True) + prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt") + prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from") + parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length") + parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k") + parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p") + parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature") + parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server") + + args = parser.parse_args(argv) + return args + + +def query_llm( + url, + model_name, + prompts, + max_output_len=128, + top_k=1, + top_p=0.0, + temperature=1.0, + init_timeout=60.0, +): + nemo_query = NemoQueryLLMPyTorch(url, model_name) + return nemo_query.query_llm( + prompts=prompts, + max_length=max_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + init_timeout=init_timeout, + ) + + +def query(argv): + args = get_args(argv) + + if args.prompt_file is not None: + with open(args.prompt_file, "r") as f: + args.prompt = f.read() + + outputs = query_llm( + url=args.url, + model_name=args.model_name, + prompts=[args.prompt], + max_output_len=args.max_output_len, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + init_timeout=args.init_timeout, + ) + print(outputs["sentences"][0][0]) + + +if __name__ == '__main__': + query(sys.argv[1:]) diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py index 5ef350b9c34a..5193fe951138 100644 --- a/tests/deploy/nemo_deploy.py +++ b/tests/deploy/nemo_deploy.py @@ -27,7 +27,7 @@ run_export_tests = True try: from nemo.deploy import DeployPyTriton - from nemo.deploy.nlp import NemoQueryLLM + from nemo.deploy.nlp import NemoQueryLLM, NemoQueryLLMPyTorch from nemo.export import TensorRTLLM except Exception as e: run_export_tests = False @@ -140,7 +140,7 @@ def run_in_framework_inference( ) nm.deploy() nm.run() - nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name) output_deployed = nq.query_llm( prompts=prompt, diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index 6073cff54423..6a296fdb92eb 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -40,7 +40,7 @@ in_framework_supported = True try: - from nemo.deploy.nlp import MegatronLLMDeployable + from 
nemo.deploy.nlp import MegatronLLMDeployable, NemoQueryLLMPyTorch except Exception as e: LOGGER.warning( f"Cannot import MegatronLLMDeployable, in-framework inference will not be available. {type(e).__name__}: {e}" @@ -101,52 +101,82 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path): for record in records: prompt = record["text_before_last_word"] expected_output = record["last_word"].strip().lower() - model_output = model.forward( - input_texts=[prompt], - max_output_len=1, - top_k=1, - top_p=0, - temperature=0.1, - task_ids=task_ids, - lora_uids=lora_uids, - ) - model_output = model_output[0][0].strip().lower() - all_expected_outputs.append(expected_output) - all_actual_outputs.append(model_output) + if model is not None: + if isinstance(model, MegatronLLMDeployable): + model_output = model.generate( + inputs=[prompt], + length_params={"min_length": 1, "max_length": 1}, + sampling_params={ + "use_greedy": True, + "temperature": 0.1, + "top_k": 1, + "top_p": 0, + "repetition_penalty": 1.0, + "add_BOS": True, + "all_probs": False, + "compute_logprob": False, + "end_strings": ["<|endoftext|>", ""], + }, + ) + # MegatronLLMDeployable returns prompt + generated output, so need to slice off prompt + model_output = model_output["sentences"][0][len(prompt) :].strip().lower() + else: + model_output = model.forward( + input_texts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_ids=task_ids, + lora_uids=lora_uids, + ) + model_output = model_output[0][0].strip().lower() + all_actual_outputs.append(model_output) + + if expected_output == model_output: + correct_answers += 1 - if expected_output == model_output: - correct_answers += 1 - - if ( - expected_output == model_output - or model_output.startswith(expected_output) - or expected_output.startswith(model_output) - ): - if len(model_output) == 1 and len(expected_output) > 1: - continue - correct_answers_relaxed += 1 + if ( + expected_output == model_output + or model_output.startswith(expected_output) + or expected_output.startswith(model_output) + ): + if len(model_output) == 1 and len(expected_output) > 1: + continue + correct_answers_relaxed += 1 if nq is not None: - trtllm_deployed_output = nq.query_llm( - prompts=[prompt], - max_output_len=1, - top_k=1, - top_p=0, - temperature=0.1, - task_id=task_ids, - ) - trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower() - - if expected_output == trtllm_deployed_output: + if isinstance(nq, NemoQueryLLMPyTorch): + deployed_output = nq.query_llm( + prompts=[prompt], + max_length=1, + top_k=1, + top_p=0, + temperature=0.1, + ) + # MegatronLLMDeployable returns prompt + generated output, so need to slice off prompt + deployed_output = deployed_output["sentences"][0][0][len(prompt) :].decode().strip().lower() + else: + deployed_output = nq.query_llm( + prompts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_id=task_ids, + ) + deployed_output = deployed_output[0][0].strip().lower() + + if expected_output == deployed_output: correct_answers_deployed += 1 if ( - expected_output == trtllm_deployed_output - or trtllm_deployed_output.startswith(expected_output) - or expected_output.startswith(trtllm_deployed_output) + expected_output == deployed_output + or deployed_output.startswith(expected_output) + or expected_output.startswith(deployed_output) ): - if len(trtllm_deployed_output) == 1 and len(expected_output) > 1: + if len(deployed_output) == 1 and len(expected_output) > 1: continue 
correct_answers_deployed_relaxed += 1 eval_end = time.monotonic() @@ -459,7 +489,7 @@ def run_existing_checkpoints( if in_framework: return run_in_framework_inference( model_name=model_name, - prompts=model_info["model_type"], + prompts=model_info["prompt_template"], checkpoint_path=model_info["checkpoint"], num_gpus=tp_size, max_output_len=model_info["max_output_len"], @@ -534,14 +564,15 @@ def run_in_framework_inference( ) nm.deploy() nm.run() - nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name) output_deployed = nq.query_llm( - prompts=[prompts], - top_k=top_k, - top_p=top_p, - temperature=temperature, + prompts=prompts, top_k=top_k, top_p=top_p, temperature=temperature, max_length=max_output_len ) + output_deployed = output_deployed["sentences"] + # MegatronLLMDeployable will return the prompt + generated output, so cut off the prompt + for i, output in enumerate(output_deployed): + output = output[len(prompts[i]) :] # Unwrap the generator if needed output_deployed = list(output_deployed) @@ -550,7 +581,8 @@ def run_in_framework_inference( accuracy_result = None if run_accuracy: print("Start model accuracy testing ...") - accuracy_result = get_accuracy_with_lambada(None, nq, None, None, test_data_path) + # This script is not written with torch.distributed support in mind, so running non-deployed in-framework models on multiple devices will not work + accuracy_result = get_accuracy_with_lambada(deployed_model, nq, None, None, test_data_path) nm.stop() @@ -736,7 +768,7 @@ def str_to_bool(name: str, s: str) -> bool: def run_inference_tests(args): - if not args.use_vllm and not trt_llm_supported: + if not args.use_vllm and not args.in_framework and not trt_llm_supported: raise UsageError("TensorRT-LLM engine is not supported in this environment.") if args.use_vllm and not vllm_supported: @@ -788,7 +820,7 @@ def run_inference_tests(args): tps = tps * 2 else: - if args.model_dir is None: + if not args.in_framework and args.model_dir is None: raise Exception("When using custom checkpoints, --model_dir is required.") prompts = ["The capital of France is", "Largest animal in the sea is"] @@ -847,6 +879,8 @@ def run_inference_tests(args): accuracy_test_result = "PASS" print_separator = False print("============= Test Summary ============") + # in-framework tests will only return deployed model accuracy results for tps > 1 + deployed_tests_only = args.in_framework and args.max_tps > 1 for num_tps, results in result_dic.items(): functional_result, accuracy_result = results @@ -876,7 +910,9 @@ def optional_bool_to_pass_fail(b: Optional[bool]): print(f"Deployed Model Accuracy: {accuracy_result.deployed_accuracy:.4f}") print(f"Deployed Relaxed Model Accuracy: {accuracy_result.deployed_accuracy_relaxed:.4f}") print(f"Evaluation Time [s]: {accuracy_result.evaluation_time:.2f}") - if accuracy_result.accuracy_relaxed < 0.5: + if (deployed_tests_only and accuracy_result.deployed_accuracy_relaxed < 0.5) or ( + not deployed_tests_only and accuracy_result.accuracy_relaxed < 0.5 + ): accuracy_test_result = "FAIL" print("=======================================") From 2ee8646c50dad7859b3ccd79c62d1b2694de3db6 Mon Sep 17 00:00:00 2001 From: jbaczek <45043825+jbaczek@users.noreply.github.com> Date: Tue, 9 Jul 2024 23:26:04 +0200 Subject: [PATCH 007/173] Use FP8 in GPT TP2 test (#9451) * Use FP8 in GPT TP2 test Signed-off-by: Jan Baczek * Add hydra options to use TE, TP overlap and FP8 Signed-off-by: Jan Baczek * Override presence 
checks in hydra Signed-off-by: Jan Baczek * WIP: Add debug code Signed-off-by: Jan Baczek * Apply isort and black reformatting Signed-off-by: jbaczek * Add more debug code Signed-off-by: Jan Baczek * Apply isort and black reformatting Signed-off-by: jbaczek * Add more debug code Signed-off-by: Jan Baczek * Apply isort and black reformatting Signed-off-by: jbaczek * Remove debug code and change underlying transformer layer to TE Signed-off-by: Jan Baczek * Override hydra error Signed-off-by: Jan Baczek * Remove tp overlap from the test Signed-off-by: Jan Baczek * Change runner for fp8 tests Signed-off-by: Jan Baczek * fix Signed-off-by: Jan Baczek * Add tp overlap test Signed-off-by: Jan Baczek * Remove TP overlap from tests. It is unsupported in docker environment Signed-off-by: Jan Baczek * Adjust GPT PP2 test to use FP8. Change optimizer in TP2 test Signed-off-by: Jan Baczek * Remove env overrides form GPT PP2 test Signed-off-by: Jan Baczek --------- Signed-off-by: Jan Baczek Signed-off-by: jbaczek Co-authored-by: jbaczek Co-authored-by: Pablo Garay --- .github/workflows/cicd-main.yml | 65 +++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d225ee3ab429..bd794f59ae32 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2391,7 +2391,7 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-2-h100 timeout-minutes: 10 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} @@ -2403,6 +2403,21 @@ jobs: --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData + env: + # This is to improve p2p overlap on H100 + NVTE_FWD_LAYERNORM_SM_MARGIN: 8 + NVTE_BWD_LAYERNORM_SM_MARGIN: 8 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_MIN_NCHANNELS: 4 + # TP overlap is not supported in docker environment + #NVTE_UB_SPLIT_RS: 0 + #NVTE_UB_ATOMIC_GEMM_RS: 1 + #NVTE_RS_STRIDED_ATOMIC: 1 + #NVTE_UB_FP8_RS: 1 + # Increase p2p chunksize to 2MB + NCCL_P2P_NET_CHUNKSIZE: 2097152 + # Disable gc when switching to/from validation steps + NEMO_MANUAL_GC_IN_VALIDATION: 0 steps: - name: Checkout repository uses: actions/checkout@v4 @@ -2417,8 +2432,17 @@ jobs: trainer.max_steps=3 \ trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ + model.optim.name=distributed_fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=1 \ model.optim.sched.constant_steps=1 \ @@ -2452,8 +2476,17 @@ jobs: trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ exp_manager.resume_if_exists=True \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.tensor_model_parallel_size=2 \ - 
model.optim.name=fused_adam \ + model.optim.name=distributed_fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=2 \ model.optim.sched.constant_steps=2 \ @@ -2945,10 +2978,11 @@ jobs: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: - RUNNER: self-hosted-azure + RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ + trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ trainer.limit_val_batches=2 \ @@ -2957,6 +2991,15 @@ jobs: trainer.precision=bf16 \ trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ model.mcore_gpt=True \ @@ -2981,12 +3024,15 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ + trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ trainer.limit_val_batches=2 \ @@ -2998,6 +3044,15 @@ jobs: model.megatron_amp_O2=True \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ exp_manager.resume_if_exists=True \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ model.optim.name=distributed_fused_adam \ @@ -3020,7 +3075,9 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings AFTER_SCRIPT: | From f5d52217f5f0f211830648458ebb7e7bad0e5f61 Mon Sep 17 00:00:00 2001 From: jomitchellnv <148147880+jomitchellnv@users.noreply.github.com> Date: Wed, 10 Jul 2024 01:19:41 -0700 Subject: [PATCH 008/173] enables default data step in megatron parallel to operate on a wider variety of tensors (#9641) * enables default data step in megatron parallel to operate on a wider variety of tensors coming out of the dataloader * handles the case where a batch is empty * Apply isort and black reformatting Signed-off-by: 
jomitchellnv * Allows the default data step to operate on more types than just dictionaries Signed-off-by: Jonathan Mitchell --------- Signed-off-by: jomitchellnv Signed-off-by: Jonathan Mitchell Co-authored-by: jomitchellnv Co-authored-by: Marc Romeyn --- nemo/lightning/megatron_parallel.py | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 2f2308717004..73913ada0cff 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -25,9 +25,11 @@ import torch import torch.distributed +from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.transformer.transformer_config import TransformerConfig +from pytorch_lightning.utilities import move_data_to_device from torch import Tensor, nn from typing_extensions import override @@ -43,15 +45,43 @@ def convert_output(self, output: torch.Tensor) -> torch.Tensor: ... def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT: - batch = next(dataloader_iter) + """ + Moves the data to a device. + + In this case we utilize the match function to unpack the dataloader iterator. There may be a wrapper on the dataloader + iter from here: https://github.com/NVIDIA/NeMo/blob/main/nemo/lightning/fabric/strategies.py#L441. - if isinstance(batch, tuple) and len(batch) == 3: - batch = batch[0] + This will not subset the data for your with context parallel so please override this function if you + want to use context parallel. - if isinstance(batch, dict): - batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()} + Examples: + If the dataloader_iter returns: [Tuple[, , ]] -> move to device + If the dataloader_iter returns: [, ] -> move to device - return batch + Returns: + DataT: The data moved to the device. + """ + if parallel_state.get_context_parallel_world_size() > 1: + raise ValueError( + "Default data step is being used in a context parallel environment." + "Please define your own data step that appropriately slices the data for context parallel." + ) + + match next(dataloader_iter): + # If its wrapped in a tuple, unpack it. + case (batch, int(_), int(_)): + pass + # Canonical case. + case batch: + pass + # If the dataloader_iter is empty, return a ValueError. 
+ case _: + batch = None + + if batch is not None: + return move_data_to_device(batch, torch.cuda.current_device()) + else: + raise ValueError("None returned from dataloader.") def default_forward_step(model: nn.Module, batch, *args, **kwargs) -> torch.Tensor: From 355d3c53cd18dfa12f1166691f2d2875d1e96247 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Wed, 10 Jul 2024 17:38:07 +0200 Subject: [PATCH 009/173] =?UTF-8?q?Revert=20"enables=20default=20data=20st?= =?UTF-8?q?ep=20in=20megatron=20parallel=20to=20operate=20on=20a=20wider?= =?UTF-8?q?=20=E2=80=A6"=20(#9666)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nemo/lightning/megatron_parallel.py | 42 +++++------------------------ 1 file changed, 6 insertions(+), 36 deletions(-) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 73913ada0cff..2f2308717004 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -25,11 +25,9 @@ import torch import torch.distributed -from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.transformer.transformer_config import TransformerConfig -from pytorch_lightning.utilities import move_data_to_device from torch import Tensor, nn from typing_extensions import override @@ -45,43 +43,15 @@ def convert_output(self, output: torch.Tensor) -> torch.Tensor: ... def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT: - """ - Moves the data to a device. - - In this case we utilize the match function to unpack the dataloader iterator. There may be a wrapper on the dataloader - iter from here: https://github.com/NVIDIA/NeMo/blob/main/nemo/lightning/fabric/strategies.py#L441. + batch = next(dataloader_iter) - This will not subset the data for your with context parallel so please override this function if you - want to use context parallel. + if isinstance(batch, tuple) and len(batch) == 3: + batch = batch[0] - Examples: - If the dataloader_iter returns: [Tuple[, , ]] -> move to device - If the dataloader_iter returns: [, ] -> move to device + if isinstance(batch, dict): + batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()} - Returns: - DataT: The data moved to the device. - """ - if parallel_state.get_context_parallel_world_size() > 1: - raise ValueError( - "Default data step is being used in a context parallel environment." - "Please define your own data step that appropriately slices the data for context parallel." - ) - - match next(dataloader_iter): - # If its wrapped in a tuple, unpack it. - case (batch, int(_), int(_)): - pass - # Canonical case. - case batch: - pass - # If the dataloader_iter is empty, return a ValueError. 
- case _: - batch = None - - if batch is not None: - return move_data_to_device(batch, torch.cuda.current_device()) - else: - raise ValueError("None returned from dataloader.") + return batch def default_forward_step(model: nn.Module, batch, *args, **kwargs) -> torch.Tensor: From 74e32c8a4fe368d7e66948a8e1258fd40ad0586c Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 10 Jul 2024 12:26:29 -0400 Subject: [PATCH 010/173] Contrastive Reranker/Reward model (#9171) * wip contrastive reranker Signed-off-by: arendu * wip Signed-off-by: arendu * wip Signed-off-by: arendu * working reranker training and validation Signed-off-by: arendu * default peft for reranker Signed-off-by: arendu * validation time update Signed-off-by: arendu * reranker test Signed-off-by: arendu * reranker inference Signed-off-by: arendu * reranker inference Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * updates Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * updates Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * also can support rlhf style reward model loss Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * typo in cicd Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: arendu Signed-off-by: Adi Renduchintala Co-authored-by: arendu --- .github/workflows/cicd-main.yml | 41 +++ ...megatron_gpt_embedder_generate_config.yaml | 1 - .../megatron_gpt_embedder_tuning_config.yaml | 2 +- .../megatron_gpt_reranker_tuning_config.yaml | 222 +++++++++++++ .../megatron_gpt_embedding_generate.py | 5 +- .../megatron_gpt_reranker_finetuning.py | 76 +++++ .../megatron_gpt_reranker_generate.py | 138 ++++++++ .../tuning/megatron_gpt_finetuning.py | 2 +- .../gpt_embedding_dataset.py | 139 +++++++- .../megatron_gpt_embedding_model.py | 48 +-- .../megatron_gpt_reranker_model.py | 301 ++++++++++++++++++ .../language_modeling/megatron_gpt_model.py | 58 ++-- .../common/megatron/adapters/mcore_mixins.py | 33 ++ .../megatron/adapters/parallel_adapters.py | 65 +++- .../nlp/parts/mixins/nlp_adapter_mixins.py | 17 +- nemo/collections/nlp/parts/peft_config.py | 18 ++ 16 files changed, 1115 insertions(+), 51 deletions(-) create mode 100644 examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml create mode 100644 examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py create mode 100644 examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py create mode 100644 nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index bd794f59ae32..10cd8d1e6561 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3198,6 +3198,47 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_Megatron_GPT_Reranker: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python 
examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + rm -rf /home/TestData/nlp/megatron_ir/working_dir + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml index 1a81d21dd9a8..e407aec167e9 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml @@ -120,7 +120,6 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: - return_output_tensors: True test_ds: query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. 
diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml index 6677dc2ed46c..1c2db1a862f4 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml @@ -84,6 +84,7 @@ model: use_flash_attention: True precision: bf16 apply_rope_fusion: False + reward_model_loss: False # Set this to true to perform RLHF style reward model loss -log(sigmoid(accept_logit - reject_logit)) peft: peft_scheme: "lora" # can be either adapter,ia3, or ptuning @@ -126,7 +127,6 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: - return_output_tensors: True train_ds: # Example of how to specify paths to multiple datasets # file_names: diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml new file mode 100644 index 000000000000..863b5fb475a0 --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_reranker_tuning_config.yaml @@ -0,0 +1,222 @@ +name: megatron_gpt_peft_reranker_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: null + num_sanity_val_steps: 0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: False + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+ sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: selective # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + temperature: 0.02 + num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + post_process: False # should be False. + apply_rope_fusion: False + transformer_engine: True # required to be True for newer versions of Megatron-LM based models + mcore_gpt: True # required to be True for newer versions of Megatron-LM based models + use_flash_attention: True + precision: bf16 + + peft: + peft_scheme: "mlp_head,lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + # Instead of using the GPT LM Head, we can use a custom head for the reranking task + mlp_head_tuning: + out_features: 1 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 + label_key: 'output' + add_eos: True + add_bos: False + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + validation_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ["validation"] # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${model.data.train_ds.max_seq_length} + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_embeddings_to_file: False + output_file_path_prefix: "validation_rankings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. 
+ names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${model.data.train_ds.max_seq_length} + min_seq_length: 1 + drop_last: False + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: True + output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py index 8cddcebbab62..d66ddb339773 100644 --- a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py @@ -68,7 +68,9 @@ def use_inference_server(cfg, model, trainer): web_ui = get_demo loop = asyncio.new_event_loop() thread = threading.Thread( - target=web_ui, daemon=True, args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), + target=web_ui, + daemon=True, + args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), ) thread.start() server = MegatronServer(model.cuda()) @@ -93,7 +95,6 @@ def main(cfg) -> None: model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) with open_dict(model_cfg): - model_cfg.data.return_output_tensors = True model_cfg.post_process = False model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py new file mode 100644 index 000000000000..cf65840bb843 --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
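For reference, a minimal sketch of the two loss formulations the embedder/reranker configs above expose: the RLHF-style reward-model loss (reward_model_loss: True, i.e. -log(sigmoid(accept_logit - reject_logit))) and the default temperature-scaled contrastive loss over hard and optional in-batch negatives. This is illustrative only, under assumed logit shapes and reductions, and is not the actual implementation in MegatronGPTRerankerModel / MegatronGPTEmbeddingModel, which is not shown in this excerpt:

    # Illustrative sketch only; accept_logit/reject_logit, pos_logit/neg_logits and the
    # mean/cross-entropy reductions are assumptions, not code from this patch.
    import torch
    import torch.nn.functional as F

    def reward_model_loss(accept_logit: torch.Tensor, reject_logit: torch.Tensor) -> torch.Tensor:
        # reward_model_loss=True: RLHF-style -log(sigmoid(accept_logit - reject_logit))
        return -F.logsigmoid(accept_logit - reject_logit).mean()

    def contrastive_loss(pos_logit: torch.Tensor, neg_logits: torch.Tensor, temperature: float = 0.02) -> torch.Tensor:
        # reward_model_loss=False: cross-entropy over one positive and N negative scores
        # (hard negatives plus optional soft/in-batch negatives), scaled by `temperature`.
        logits = torch.cat([pos_logit.unsqueeze(1), neg_logits], dim=1) / temperature
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)
        return F.cross_entropy(logits, labels)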
+
+from collections.abc import MutableMapping
+
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf
+from pytorch_lightning.loggers import WandbLogger
+
+from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder
+from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+mp.set_start_method("spawn", force=True)
+
+
+def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping:
+    items = []
+    for k, v in d.items():
+        new_key = parent_key + sep + k if parent_key else k
+        if isinstance(v, MutableMapping):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_tuning_config")
+def main(cfg) -> None:
+    logging.info("\n\n************** Experiment configuration ***********")
+    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+    trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer()
+    exp_manager(trainer, cfg.exp_manager)
+
+    model_cfg = MegatronGPTRerankerModel.merge_cfg_with(cfg.model.restore_from_path, cfg)
+    if trainer.global_rank == 0:
+        for logger in trainer.loggers:
+            if isinstance(logger, WandbLogger):
+                fd = flatten_dict(dict(model_cfg), sep="/")
+                logger.experiment.config.update(fd)
+    model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
+    peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")]
+    peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst]
+
+    if cfg.model.peft.restore_from_path is not None:
+        # initialize peft weights from a checkpoint instead of randomly
+        # This is not the same as resume training because optimizer states are not restored.
+        logging.info(f"PEFT Weights will be loaded from {cfg.model.peft.restore_from_path}")
+        model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls)
+    elif peft_cfg_cls is not None:
+        logging.info("Adding adapter weights to the model for PEFT")
+        # model.add_adapter(peft_cfg_cls(model_cfg))
+        model.add_adapter(peft_cfg_cls)
+    else:
+        logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}")
+
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py b/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py
new file mode 100644
index 000000000000..a91449c3deda
--- /dev/null
+++ b/examples/nlp/information_retrieval/megatron_gpt_reranker_generate.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +import asyncio +import os +import threading +from functools import partial + +import torch +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_reranker_model import MegatronGPTRerankerModel +from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer +from nemo.collections.nlp.modules.common.text_generation_utils import generate +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +mp.set_start_method("spawn", force=True) + + +def use_inference_server(cfg, model, trainer): + if not HAVE_MEGATRON_CORE: + raise ValueError('Megatron-core needs to be installed to use this feature!') + + from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo + + trainer.test(model, dataloaders=None) + + if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: + if cfg.web_server: + if cfg.chat: + defaults = { + 'user': cfg.chatbot_config.user, + 'assistant': cfg.chatbot_config.assistant, + 'system': cfg.chatbot_config.system, + } + web_ui = partial( + get_chatbot_demo, + defaults=defaults, + value=cfg.chatbot_config.value, + attributes=cfg.chatbot_config.attributes, + ) + else: + web_ui = get_demo + loop = asyncio.new_event_loop() + thread = threading.Thread( + target=web_ui, + daemon=True, + args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), + ) + thread.start() + server = MegatronServer(model.cuda()) + server.run("0.0.0.0", port=cfg.port) + + while True: + choice = torch.cuda.LongTensor(1) + torch.distributed.broadcast(choice, 0) + if choice[0].item() == 0: + generate(model.cuda()) + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_reranker_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronGPTRerankerModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + with open_dict(model_cfg): + model_cfg.post_process = False + + model = MegatronGPTRerankerModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in cfg.model.peft.peft_scheme.split(",")] + peft_cfg_cls = [_peft_cfg(model_cfg) for _peft_cfg in peft_cfg_cls_lst] + + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = 
inject_model_parallel_rank( + os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls) + else: + raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + if not cfg.model.get('use_flash_attention', False): + cfg.inference.compute_attention_mask = True + config = OmegaConf.to_container(cfg.inference, resolve=True) + model.set_inference_config(config) + + if not cfg.server: + trainer.test(model) + else: + use_inference_server(cfg, model, trainer) + + +if __name__ == "__main__": + main() diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py index aaa087a46623..bfe8ea35960e 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py index e697d5ec3bf6..3a2a8152313e 100644 --- a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py +++ b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py @@ -27,7 +27,7 @@ from nemo.core.classes import Dataset from nemo.utils import logging -__all__ = ['GPTEmbeddingDataset'] +__all__ = ['GPTEmbeddingDataset', 'GPTRerankerDataset'] class GPTEmbeddingDataset(Dataset): @@ -49,7 +49,7 @@ def __init__( data_type: str = 'train', # train, query or doc ): """ - file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. + file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. @@ -279,3 +279,138 @@ def collate_fn(self, batch): } return processed_batch + + +class GPTRerankerDataset(GPTEmbeddingDataset): + def __init__( + self, + file_path: str, + tokenizer: TokenizerSpec, + max_seq_length: int = 1024, + min_seq_length: int = 1, + add_bos: bool = False, + add_eos: bool = True, + max_num_samples: int = None, + seed: int = 1234, + index_mapping_dir: str = None, + virtual_tokens: int = 0, + memmap_workers: Optional[int] = None, + truncation_method: str = 'right', + special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} + data_type: str = 'train', # train, query or doc + ): + """ + file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. + tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). 
+ max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + add_bos (bool): Whether to add a beginning of sentence token to each data example + add_eos (bool): Whether to add an end of sentence token to each data example + seed: Random seed for data shuffling. + max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. + index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. + truncation_method: Truncation from which position. Options: ['left', 'right'] + special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + """ + super().__init__( + file_path=file_path, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + min_seq_length=min_seq_length, + add_bos=add_bos, + add_eos=add_eos, + max_num_samples=max_num_samples, + seed=seed, + index_mapping_dir=index_mapping_dir, + virtual_tokens=virtual_tokens, + memmap_workers=memmap_workers, + truncation_method=truncation_method, + special_tokens=special_tokens, + data_type=data_type, + ) + + def _process_example(self, example): + """ + Create an example by concatenating text and answer. + Truncation is carried out when needed, but it is performed only on the prompt side. + BOS, EOS, and SEP, are added if specified. + """ + metadata = {k: v for k, v in example.items()} + if self.data_type == 'train': + qd = self.tokenizer.text_to_ids( + "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() + ) + qnd = self.tokenizer.text_to_ids( + "query: " + example['query'].strip() + " passage: " + example['neg_doc'].strip() + ) + else: + qd = self.tokenizer.text_to_ids( + "query: " + example['query'].strip() + " passage: " + example['pos_doc'].strip() + ) + qnd = [] + + if self.virtual_tokens: + # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context + # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) + qd = [self.tokenizer.eos_id] * self.virtual_tokens + qd # type: ignore + qnd = [self.tokenizer.eos_id] * self.virtual_tokens + qnd # type: ignore + + if self.add_bos: + qd = [self.tokenizer.bos_id] + qd # type: ignore + qnd = [self.tokenizer.bos_id] + qnd # type: ignore + + # TODO: (@adithyare) should probably add a warning before truncation + qd = qd[: self.max_seq_length - 1] + qnd = qnd[: self.max_seq_length - 1] + + if self.add_eos: + qd = qd + [self.tokenizer.eos_id] # type: ignore + qnd = qnd + [self.tokenizer.eos_id] # type: ignore + + processed_example = { + 'query_pos_doc': qd, + 'query_neg_doc': qnd, + 'metadata': metadata, + } + + return processed_example + + def collate_fn(self, batch): + input_ids = [] + metadata = [] + lengths = [] + max_length = -1 + for item in batch: + metadata.append(item['metadata']) + if self.data_type == 'train': + input_ids.append(item['query_pos_doc']) + lengths.append(len(item['query_pos_doc'])) + input_ids.append(item['query_neg_doc']) + lengths.append(len(item['query_neg_doc'])) + max_length = max(max_length, len(item['query_pos_doc']), len(item['query_neg_doc'])) + else: + 
input_ids.append(item['query_pos_doc']) + lengths.append(len(item['query_pos_doc'])) + max_length = max(max_length, len(item['query_pos_doc'])) + + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) + assert max_length <= self.max_seq_length + + attention_mask = [self._create_attention_mask(max_length) for _ in input_ids] + attention_mask = torch.stack(attention_mask) + position_ids = [list(range(max_length)) for _ in input_ids] + position_ids = torch.LongTensor(position_ids) + input_ids = torch.LongTensor( + self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) + ) + lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token + + processed_batch = { + 'tokens': input_ids, + 'attention_mask': attention_mask, + 'loss_mask': lengths, + 'position_ids': position_ids, + 'metadata': metadata, + } + + return processed_batch diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py index 67fd2b1b6c62..c7565f45358e 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -36,11 +36,6 @@ except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False -try: - - HAVE_APEX = True -except (ImportError, ModuleNotFoundError): - HAVE_APEX = False def listify(tensor): @@ -52,6 +47,17 @@ def listify(tensor): return l_tensor +def _gather_global_inbatch_representations(local_eos_tensor): + local_eos_tensor = local_eos_tensor.contiguous() + global_eos_tensors = [ + torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) + ] + torch.distributed.all_gather(global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group()) + global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor + global_eos_tensors = torch.cat(global_eos_tensors, dim=0) + return global_eos_tensors + + class MegatronGPTEmbeddingModel(MegatronGPTSFTModel): def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) @@ -412,25 +418,20 @@ def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): hs = eos_tensors hs = torch.nn.functional.normalize(hs, dim=1) _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] - return _blank, hs, hs, _blank, _blank, _blank - - def _gather_global_inbatch_representations(self, local_eos_tensor): - local_eos_tensor = local_eos_tensor.contiguous() - global_eos_tensors = [ - torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) - ] - torch.distributed.all_gather( - global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group() - ) - global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor - global_eos_tensors = torch.cat(global_eos_tensors, dim=0) - return global_eos_tensors + return { + "loss": _blank, + "query_hs": hs, + "pos_doc_hs": hs, + "pos_cs": _blank, + "neg_cs": _blank, + "diff_cs": _blank, + } def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) eos_tensors = output_tensor[loss_mask, idx, :] if self.global_inbatch_negatives and self.trainer.training: - eos_tensors = self._gather_global_inbatch_representations(eos_tensors) + eos_tensors = 
_gather_global_inbatch_representations(eos_tensors) if not self.trainer.training: return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) bs = eos_tensors.shape[0] // 3 @@ -464,4 +465,11 @@ def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): query_hs = query_hs.clone().detach() pos_doc_hs = pos_doc_hs.clone().detach() diff_cs = pos_cs - neg_cs - return loss, query_hs, pos_doc_hs, pos_cs, neg_cs, diff_cs + return { + "loss": loss, + "query_hs": query_hs, + "pos_doc_hs": pos_doc_hs, + "pos_cs": pos_cs, + "neg_cs": neg_cs, + "diff_cs": diff_cs, + } diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py new file mode 100644 index 000000000000..e316871fe607 --- /dev/null +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_reranker_model.py @@ -0,0 +1,301 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import os + +import numpy as np +import torch +from omegaconf import DictConfig, ListConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTRerankerDataset +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import ( + MegatronGPTEmbeddingModel, + _gather_global_inbatch_representations, +) +from nemo.utils import logging + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + + +def listify(tensor): + l_tensor = [] + for t in tensor: + for rid in range(t.shape[0]): + r = t[rid, :].unsqueeze(0).cpu() + l_tensor.append(r) + return l_tensor + + +class MegatronGPTRerankerModel(MegatronGPTEmbeddingModel): + def __init__(self, cfg: DictConfig, trainer: Trainer): + self.reward_model_loss = cfg.get("reward_model_loss", False) + super().__init__(cfg, trainer=trainer) + + def model_provider_func(self, pre_process, post_process): + # (@adithyare) We need post_process to be False to get hidden states in the loss_func + return super().model_provider_func(pre_process, post_process=False) + + def maybe_setup_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + return + + def maybe_build_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + logging.info('Building GPT Reranker test datasets.') + # Wrap this in a list since the general finetuning parent class supports multi-validation. 
+ self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) + + def _build_dataset(self, data_cfg, is_train=True): + packed_sequence = data_cfg.get("packed_sequence", False) + + # Determine if we are using a single dataset or a list of datasets. + if is_train: + # Construct the data prefix list for `get_datasets_weights_and_num_samples()` + # that is of the format [weight1,file_name1,weight2,file_name2,...] + if data_cfg.concat_sampling_probabilities is None or not isinstance( + data_cfg.concat_sampling_probabilities, ListConfig + ): + raise ValueError( + ( + f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." + f"Found: {data_cfg.concat_sampling_probabilities}" + ) + ) + + if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): + raise ValueError( + ( + f"concat_sampling_probabilities must be of the same size as file_names.", + f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", + ) + ) + + data_prefix = [] + for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): + data_prefix.append(weight) + data_prefix.append(prefix) + + if self.trainer.max_steps is None or self.trainer.max_steps <= 0: + raise ValueError( + f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' + ) + num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] + _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) + num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) + else: + num_train_samples_per_dataset = [[None]] * len(data_cfg.file_names) + + # Check dataset max_seq_legnth and max_position_embeddings size + if ( + self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] + and data_cfg.max_seq_length > self.cfg.max_position_embeddings + ): + logging.warning( + f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" + ) + data_cfg.max_seq_length = self.cfg.max_position_embeddings + + # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 + # When using sequence parallel, sequence will further be split by TP size + pad_seq_length_to_mult = ( + 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 + ) + pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1) + + datasets = [] + for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): + dataset = GPTRerankerDataset( + file_path=file_path, + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=num_samples[0], + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + data_type="train" if is_train else "validation", + ) + datasets.append(dataset) + if is_train: + if packed_sequence: + num_train_samples_after_blend = sum(len(dataset) for dataset in datasets) + dataset = BlendableDataset( + datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + return dataset + else: + return datasets + + def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): + loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) + logit_diff = non_loss_tensors['logit_diff'][0].item() + self.log("logit_diff", logit_diff, prog_bar=True, rank_zero_only=True, batch_size=1) + return loss_mean + + def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): + metadata = batch.get('metadata', [{}] * len(batch['tokens'])) + loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) + outputs = { + 'loss': loss, + 'metadata': metadata, # [dict] + 'query_pos_doc_logit': non_loss_tensors['query_pos_doc_logit'], # [batch_size, hidden_size] + } + return outputs + + def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): + query_pos_doc_hs = eos_tensors + _blank = torch.zeros(1, device=query_pos_doc_hs.device, dtype=query_pos_doc_hs.dtype)[0] + return { + "loss": _blank, + "query_pos_doc_logit": query_pos_doc_hs, + "query_neg_doc_logit": _blank, + "logit_diff": _blank, + } + + def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): + idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) + eos_tensors = output_tensor[loss_mask, idx, :] # (bs x 1) + if self.global_inbatch_negatives and self.trainer.training: + eos_tensors = _gather_global_inbatch_representations(eos_tensors) + if not self.trainer.training: + return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) + bs = eos_tensors.shape[0] // 2 + query_pos_doc_hs = eos_tensors[::2, :] # every second tensor from idx 0 is a query w pos_doc (bs x 1) + query_neg_doc_hs = eos_tensors[1::2, :] # every second tensor from idx 1 is a query w negative doc (bs x 1) + + if self.reward_model_loss: + loss = -torch.nn.functional.logsigmoid(query_pos_doc_hs - query_neg_doc_hs).mean() + else: + cs = torch.cat([query_pos_doc_hs, query_neg_doc_hs], dim=1) # (bs x 2) + cs = cs / self.temperature + labels = torch.zeros(bs, device=cs.device).long() + loss = torch.nn.functional.cross_entropy(cs, labels) + + cp_size = self.cfg.get('context_parallel_size', 1) + if cp_size > 1: + torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) + query_pos_doc_hs = query_pos_doc_hs.clone().detach() + query_neg_doc_hs = query_neg_doc_hs.clone().detach() + logit_diffs = torch.mean(query_pos_doc_hs - query_neg_doc_hs) + return { + "loss": loss, + "query_pos_doc_logit": query_pos_doc_hs, + "query_neg_doc_logit": query_neg_doc_hs, + "logit_diff": logit_diffs, + } + + def gather_and_maybe_write_predictions(self, output, data_cfg, mode, averaged_metric, dataloader_idx=0): + if not data_cfg.get("write_embeddings_to_file", False): + return True + gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + gathered_output_batches, + [ + { + 'query_pos_doc_logit': batch['query_pos_doc_logit'], + 'metadata': batch['metadata'], + } + for batch 
in output + ], + group=parallel_state.get_data_parallel_group(), + ) + + # Remove duplicate examples due to distributed sampler. + deduplicated_outputs = { + 'query_pos_doc_logit': [], + 'metadata': [], + } + total_size, skipped = 0, 0 + for rank in range(0, parallel_state.get_data_parallel_world_size()): + for batch in gathered_output_batches[rank]: + l_q_hs = listify(batch['query_pos_doc_logit']) + l_m = batch['metadata'] + assert len(l_m) == len(l_q_hs) + for q_hs, metadata in zip( + l_q_hs, + l_m, + ): + total_size += 1 + if not metadata.get("__AUTOGENERATED__", False): + deduplicated_outputs['query_pos_doc_logit'].append(q_hs) + deduplicated_outputs['metadata'].append(metadata) + else: + skipped += 1 + + logging.info( + f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." + ) + # Compute metric score + metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name + assert metric_name == "loss", "Only loss is supported for now." + # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() + # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() + # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() + # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + + # Write predictions to file + if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): + logging.info( + f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" + ) + + # Check if the user provided a prefix path to the file(s) they want to write. + if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: + raise ValueError( + f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." 
+ ) + # (@adithyare) We are not using the log key to write the embeddings to file + filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) + consumed_samples = self._compute_consumed_samples_after_training_step() + fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" + self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) + return deduplicated_outputs, total_size + + def write_embeddings_to_file(self, outputs, output_file_path, d_idx): + hs = torch.cat(outputs['query_pos_doc_logit'], dim=0) + hs_npy = hs.float().numpy() + emb_fldr = f"{output_file_path}" + os.makedirs(emb_fldr, exist_ok=True) + with open(f"{output_file_path}/logits.ids", "w") as f: + for m in outputs['metadata']: + f.write(f"{m['query_id'].strip()} {m['doc_id']}\n") + np.save(f"{emb_fldr}/logits.npy", hs_npy) + return True diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4f9722d900f6..69cd06021f50 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -391,7 +391,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) self.loss_broadcast_src_rank = None data_cfg = cfg.get('data', {}) - self.return_output_tensors = data_cfg.get('return_output_tensors', False) self.validation_drop_last = data_cfg.get('validation_drop_last', True) self.sample_weight = data_cfg.get('sample_weight', 'token') self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) @@ -1275,24 +1274,47 @@ def loss_func(output_tensor): # Loss for a micro-batch (ub) loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor) cp_size = parallel_state.get_context_parallel_world_size() - if self.return_output_tensors: + if isinstance(loss_for_ub, dict): # TODO: need a better way to check if loss_func is returning more stuff than just loss... 
(@adithyare) - loss_for_ub, q_hs, d_hs, pos_cs, neg_cs, diff_cs = loss_for_ub - reduced_loss = average_losses_across_data_parallel_group([loss_for_ub]) - pos_cs = average_losses_across_data_parallel_group([pos_cs]) - neg_cs = average_losses_across_data_parallel_group([neg_cs]) - diff_cs = average_losses_across_data_parallel_group([diff_cs]) - return ( - loss_for_ub * cp_size, - { - 'avg': reduced_loss, - 'query_hs': q_hs, - 'doc_hs': d_hs, - 'avg_pos_cs': pos_cs, - 'avg_neg_cs': neg_cs, - 'diff_cs': diff_cs, - }, - ) + + if set(loss_for_ub.keys()) == set( + ["loss", "query_hs", "pos_doc_hs", "pos_cs", "neg_cs", "diff_cs"] + ): # (adithyare) this check will be True for GPT Embedding models + loss = loss_for_ub['loss'] + reduced_loss = average_losses_across_data_parallel_group([loss]) + pos_cs = average_losses_across_data_parallel_group([loss_for_ub['pos_cs']]) + neg_cs = average_losses_across_data_parallel_group([loss_for_ub['neg_cs']]) + diff_cs = average_losses_across_data_parallel_group([loss_for_ub['diff_cs']]) + return ( + loss * cp_size, + { + 'avg': reduced_loss, + 'query_hs': loss_for_ub['query_hs'], + 'doc_hs': loss_for_ub['pos_doc_hs'], + 'avg_pos_cs': pos_cs, + 'avg_neg_cs': neg_cs, + 'diff_cs': diff_cs, + }, + ) + elif set(loss_for_ub.keys()) == set( + ["loss", "query_pos_doc_logit", "query_neg_doc_logit", "logit_diff"] + ): # (adithyare) this check will be True for GPT Reranker models + + loss = loss_for_ub['loss'] + reduced_loss = average_losses_across_data_parallel_group([loss]) + logit_diff = average_losses_across_data_parallel_group([loss_for_ub['logit_diff']]) + return ( + loss * cp_size, + { + 'avg': reduced_loss, + 'query_pos_doc_logit': loss_for_ub['query_pos_doc_logit'], + 'query_neg_doc_logit': loss_for_ub['query_neg_doc_logit'], + 'logit_diff': logit_diff, + }, + ) + else: + raise RuntimeError(f"Dict loss_for_ub has unknown key set {loss_for_ub.keys()}") + elif validation_step and not self.validation_drop_last: num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub'] if loss_for_ub.isnan(): diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 2f00f5907ad8..48b6afa788ae 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -14,17 +14,21 @@ import torch import torch.nn.functional as F +from megatron.core import InferenceParams from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.transformer.attention import SelfAttention from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim from megatron.core.transformer.mlp import MLP from megatron.core.transformer.moe.experts import SequentialMLP +from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor +from torch import Tensor from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -37,6 +41,7 
@@ LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, + MLPHeadAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, PromptEncoderAdapterConfig, @@ -61,6 +66,34 @@ def mcore_register_adapters(self): raise NotImplementedError("Mcore mixins should implement setup_adapters on a subclass of MyBase") +class MCoreTransformerBlockMixin(TransformerBlock, MCoreAdapterModuleMixin): + def mcore_register_adapters(self): + """ + Setup NeMo (canonical) Adapter to this MCore layer. + """ + self.set_accepted_adapter_types([MLPHeadAdapterConfig._target_]) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor = None, + context_mask: Tensor = None, + rotary_pos_emb: Tensor = None, + inference_params: InferenceParams = None, + packed_seq_params: PackedSeqParams = None, + ): + hidden_states = super().forward( + hidden_states, attention_mask, context, context_mask, rotary_pos_emb, inference_params, packed_seq_params + ) + + mlp_head_adapter = self.get_adapter_module(AdapterName.MLP_HEAD_ADAPTER) + if mlp_head_adapter and self.adapter_cfg[AdapterName.MLP_HEAD_ADAPTER]['enabled']: + hidden_states = mlp_head_adapter(hidden_states) + + return hidden_states + + class MCoreSelfAttentionMixin(SelfAttention, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 9ab1da7136a1..8d2d77c55cf2 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -77,6 +77,7 @@ class AdapterName(str, enum.Enum): PTUNING_ADAPTER = "ptuning_adapter" LORA_KQV_ADAPTER = "lora_kqv_adapter" LORA_UNFUSED_KQV_ADAPTER = "lora_unfused_kqv_adapter" + MLP_HEAD_ADAPTER = "mlp_head_adapter" LORA_KV_ADAPTER = "lora_kv_adapter" LORA_Q_ADAPTER = "lora_q_adapter" MM_LINEAR_ADAPTER = "mm_linear_adapter" @@ -388,6 +389,57 @@ class ParallelLinearAdapterConfig(AdapterConfig): _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) +class MLPHeadAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + in_features: int, + out_features: int, + input_is_parallel: bool = False, + model_parallel_config: Optional[ModelParallelConfig] = None, + **kwargs, + ): + super().__init__() + if model_parallel_config is None: + model_parallel_config = ModelParallelConfig() + self._sequence_parallel = model_parallel_config.sequence_parallel + model_parallel_config.sequence_parallel = False # SP is irrelevant for the lora linear layer + + if input_is_parallel: + self.linear = RowParallelLinear( + in_features, + out_features, + config=model_parallel_config, + input_is_parallel=True, + skip_bias_add=True, + bias=False, + init_method=init.xavier_normal_, + ) + else: + self.linear = ColumnParallelLinear( + in_features, + out_features, + config=model_parallel_config, + bias=False, + gather_output=True, + init_method=init.xavier_normal_, + disable_grad_reduce=self._sequence_parallel, + ) + + # Setup adapter strategy + self.setup_adapter_strategy(adapter_mixin_strategies.ReturnResultAdapterStrategy()) + + def forward(self, x): + x, _ = self.linear(x) + return x + + +@dataclass +class MLPHeadAdapterConfig(AdapterConfig): + in_features: int + out_features: int + _target_: str = "{0}.{1}".format(MLPHeadAdapter.__module__, MLPHeadAdapter.__name__) + + class 
LoraKQVAdapter(ParallelLinearAdapter): """ Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes @@ -777,14 +829,21 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table(self): + def clear_inference_table( + self, + ): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table(self): + def get_inference_table( + self, + ): return self.inference_table.data - def inner_forward(self): + def inner_forward( + self, + ): + input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 2bacaf52e3f8..90b3912784c8 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -30,8 +30,13 @@ HAVE_MEGATRON_CORE = False -from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import PromptEncoderAdapterConfig +from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( + MLPHeadAdapterConfig, + PromptEncoderAdapterConfig, +) + from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector + from nemo.collections.nlp.parts.peft_config import ( PEFT_CONFIG_MAP, CanonicalAdaptersPEFTConfig, @@ -168,7 +173,11 @@ def _check_and_add_peft_cfg(self, peft_cfg): for adapter_name, adapter_cfg in peft_cfg.get_config_dict().items(): # self.mcore_gpt means is GPT and not T5 - if hasattr(self, 'mcore_gpt') and not isinstance(adapter_cfg, PromptEncoderAdapterConfig): + if ( + hasattr(self, 'mcore_gpt') + and not isinstance(adapter_cfg, PromptEncoderAdapterConfig) + and not isinstance(adapter_cfg, MLPHeadAdapterConfig) + ): if layer_selection is not None: logging.info( f"Layer selection {layer_selection} is enabled for the current model (" @@ -351,8 +360,10 @@ def load_adapters( assert filepath.endswith( '.nemo' ), "Inferring peft scheme is only supported for .nemo checkpoints. Please supply the `peft_cfgs` argument." 
- peft_cfgs = [PEFT_CONFIG_MAP[conf.peft.peft_scheme](conf)] + peft_cfg_cls_lst = [PEFT_CONFIG_MAP[s] for s in conf.peft.peft_scheme.split(",")] + peft_cfgs = [_peft_cfg(conf) for _peft_cfg in peft_cfg_cls_lst] if getattr(self, 'megatron_amp_O2', False): + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} self.add_adapter(peft_cfgs) if not self.ptuning_only_and_non_first_stage: diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 726ca33611d7..25f303fc22fb 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -24,6 +24,7 @@ MCoreMLPMixin, MCoreSelfAttentionMixin, MCoreSequentialMLPMixin, + MCoreTransformerBlockMixin, MCoreTransformerLayerMixin, ) except (ImportError, ModuleNotFoundError): @@ -41,6 +42,7 @@ LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, + MLPHeadAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, ParallelLinearAdapterWeightTyingConfig, @@ -127,6 +129,21 @@ def __init__(self, cfg): self.tunable_base_param_names = selective_cfg.get("tunable_base_param_names", []) +class MLPHeadPEFTConfig(PEFTConfig): + def __init__(self, cfg): + config_args = {"in_features": cfg.hidden_size, "out_features": cfg.peft.mlp_head_tuning.out_features} + mlp_head_cfg = MLPHeadAdapterConfig(**config_args) + + name_key_to_cfg = { + AdapterName.MLP_HEAD_ADAPTER: mlp_head_cfg, + } + self.name_key_to_mcore_mixins = { + AdapterName.MLP_HEAD_ADAPTER: [("decoder", MCoreTransformerBlockMixin)], + } + + super().__init__(cfg.peft.mlp_head_tuning, name_key_to_cfg) + + class LoraPEFTConfig(PEFTConfig): def __init__(self, cfg): lora_cfg = cfg.peft.lora_tuning @@ -401,6 +418,7 @@ def __init__(self, cfg): "ia3": IA3PEFTConfig, "ptuning": PtuningPEFTConfig, "lora": LoraPEFTConfig, + "mlp_head": MLPHeadPEFTConfig, "qlora": QLoraPEFTConfig, "selective": SelectivePEFTConfig, 'none': None, From b4821e1a578e363a427ff0451edc89da6b6ae9f9 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:37:37 +0300 Subject: [PATCH 011/173] unpin transformers version (#9606) * unpin transformers Signed-off-by: dimapihtar * guard deprecated imports Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * fix import guards Signed-off-by: dimapihtar * fix import guards Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * try fixing Signed-off-by: Chen Cui * disable HF tests Signed-off-by: Dmytro Pykhtar * try fixing Signed-off-by: Chen Cui * hard code model lists Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * hard code model lists Signed-off-by: Chen Cui --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: Chen Cui Signed-off-by: Dmytro Pykhtar Signed-off-by: cuichenx Co-authored-by: dimapihtar Co-authored-by: Chen Cui Co-authored-by: Dmytro Pykhtar Co-authored-by: cuichenx --- .../common/huggingface/huggingface_utils.py | 82 +++++++++++++++++-- requirements/requirements_lightning.txt | 2 +- 2 files changed, 75 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py b/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py index cf692e07749d..d8f6936f7126 100644 --- a/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py +++ 
b/nemo/collections/nlp/modules/common/huggingface/huggingface_utils.py @@ -16,12 +16,6 @@ from typing import List, Optional from transformers import ( - ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, - ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertConfig, AutoModel, BertConfig, @@ -41,6 +35,74 @@ __all__ = ["get_huggingface_lm_model", "get_huggingface_pretrained_lm_models_list", "VOCAB_FILE_NAME"] +# Manually specify the model archive lists since these are now removed in HF +# https://github.com/huggingface/transformers/blob/v4.40-release/src/transformers/models/deprecated/_archive_maps.py +ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "albert/albert-base-v1", + "albert/albert-large-v1", + "albert/albert-xlarge-v1", + "albert/albert-xxlarge-v1", + "albert/albert-base-v2", + "albert/albert-large-v2", + "albert/albert-xlarge-v2", + "albert/albert-xxlarge-v2", +] + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google-bert/bert-base-uncased", + "google-bert/bert-large-uncased", + "google-bert/bert-base-cased", + "google-bert/bert-large-cased", + "google-bert/bert-base-multilingual-uncased", + "google-bert/bert-base-multilingual-cased", + "google-bert/bert-base-chinese", + "google-bert/bert-base-german-cased", + "google-bert/bert-large-uncased-whole-word-masking", + "google-bert/bert-large-cased-whole-word-masking", + "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", + "google-bert/bert-large-cased-whole-word-masking-finetuned-squad", + "google-bert/bert-base-cased-finetuned-mrpc", + "google-bert/bert-base-german-dbmdz-cased", + "google-bert/bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + "wietsedv/bert-base-dutch-cased", +] +CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "almanach/camembert-base", + "Musixmatch/umberto-commoncrawl-cased-v1", + "Musixmatch/umberto-wikipedia-uncased-v1", +] + +DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "distilbert-base-uncased", + "distilbert-base-uncased-distilled-squad", + "distilbert-base-cased", + "distilbert-base-cased-distilled-squad", + "distilbert-base-german-cased", + "distilbert-base-multilingual-cased", + "distilbert-base-uncased-finetuned-sst-2-english", +] +GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "openai-community/gpt2", + "openai-community/gpt2-medium", + "openai-community/gpt2-large", + "openai-community/gpt2-xl", + "distilbert/distilgpt2", +] +ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "FacebookAI/roberta-base", + "FacebookAI/roberta-large", + "FacebookAI/roberta-large-mnli", + "distilbert/distilroberta-base", + "openai-community/roberta-base-openai-detector", + "openai-community/roberta-large-openai-detector", +] + HUGGINGFACE_MODELS = { "BertModel": { @@ -94,7 +156,9 @@ def get_huggingface_lm_model( - pretrained_model_name: str, config_dict: Optional[dict] = None, config_file: Optional[str] = None, + pretrained_model_name: str, + config_dict: Optional[dict] = None, + config_file: Optional[str] = None, ): """ Returns lm model instantiated with Huggingface @@ -135,7 +199,9 @@ def get_huggingface_lm_model( raise ValueError(f"Use HuggingFace API directly in NeMo for {pretrained_model_name}") -def 
get_huggingface_pretrained_lm_models_list(include_external: bool = False,) -> List[str]:
+def get_huggingface_pretrained_lm_models_list(
+    include_external: bool = False,
+) -> List[str]:
     """
     Returns the list of pretrained HuggingFace language models
 
diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt
index c7e67d21a693..1b3397f69033 100644
--- a/requirements/requirements_lightning.txt
+++ b/requirements/requirements_lightning.txt
@@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2
 omegaconf<=2.3
 pytorch-lightning>2.2.1
 torchmetrics>=0.11.0
-transformers>=4.36.0,<=4.40.2
+transformers
 wandb
 webdataset>=0.2.86

From 14d42dc599ff3d948f1e1271b1890d5b8c5fbd77 Mon Sep 17 00:00:00 2001
From: Selvaraj Anandaraj
Date: Wed, 10 Jul 2024 10:56:40 -0700
Subject: [PATCH 012/173] Added CPU offloading docs (#9479)

* Added CPU offloading docs

Signed-off-by: Selvaraj Anandaraj

* Tech writer review

Signed-off-by: Selvaraj Anandaraj

---------

Signed-off-by: Selvaraj Anandaraj
Co-authored-by: Selvaraj Anandaraj
Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com>
---
 docs/source/features/memory_optimizations.rst | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst
index 4d363670fedf..1fe8215864a9 100644
--- a/docs/source/features/memory_optimizations.rst
+++ b/docs/source/features/memory_optimizations.rst
@@ -105,3 +105,24 @@ Implement MQA or GQA
 NeMo's support for GQA and MQA is enabled through the integration of Megatron Core's Attention mechanism. The underlying implementation details can be explored within the Attention class of Megatron Core, which provides the functional backbone for these advanced attention methods. To understand the specific modifications and implementations of MQA and GQA, refer to the source code in the Attention class:
 
 Check implementation details from Attention Class in Megatron Core Repo: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py#L49
+
+
+CPU Offloading
+--------------
+
+Overview
+^^^^^^^^
+
+CPU Offloading in NeMo is a feature that reduces the peak memory usage of the GPU by offloading activations and inactive weights to CPU storage. NeMo supports offloading at the transformer layer level, allowing users to specify the number of transformer layers in their language model that require CPU offloading. During the forward pass, NeMo offloads activations at the optimal time and reloads them as needed during the backward pass.
+
+Features
+^^^^^^^^
+* Supports training models with long sequence lengths by managing activation memory efficiently.
+* Enables high batch sizes per GPU by offloading activation memory.
+* Overlaps computation with data transfers (Host2Device and Device2Host) during offloading and reloading.
+
+Usage
+^^^^^
+* Set ``cpu_offloading`` to ``True`` to enable CPU offloading.
+* Set ``cpu_offloading_num_layers`` to a value between 0 and the total number of layers in the model minus one.
+* Set ``cpu_offloading_activations`` and ``cpu_offloading_weights`` based on your needs to offload activations only, weights only, or both.
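For reference, the usage flags documented in the patch above correspond to keys in the ``model`` section of a Megatron GPT training config. The sketch below is illustrative only: the key names are taken from the documentation added in this patch, while the nesting under ``model`` and the example values (including the assumed 24-layer model) are assumptions and not part of the patch.

.. code:: yaml

    model:
      num_layers: 24                    # assumed model depth for this example
      cpu_offloading: True              # enable CPU offloading
      cpu_offloading_num_layers: 20     # must be between 0 and num_layers - 1
      cpu_offloading_activations: True  # offload activations
      cpu_offloading_weights: True      # offload inactive weights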
From 3ab0a2a65f53dc580a75960e5897c33920087db3 Mon Sep 17 00:00:00 2001 From: Shashank Verma Date: Wed, 10 Jul 2024 16:10:45 -0700 Subject: [PATCH 013/173] Update llama-3 PEFT notebook to download model from NGC (#9667) * Update llama-3 PEFT notebook to download model from NGC Signed-off-by: Shashank Verma * Fix broken link in llama-3 PEFT tutorial README Signed-off-by: Shashank Verma * Fix broken code block in llama 3 PEFT tutorial README Signed-off-by: Shashank Verma * Copy-edits to Llama-3 8B PEFT tutorial README Signed-off-by: Shashank Verma * Fix broken link Signed-off-by: Shashank Verma * Minor formatting fixes Signed-off-by: Shashank Verma --------- Signed-off-by: Shashank Verma --- tutorials/llm/llama-3/README.rst | 54 ++++---- .../llm/llama-3/llama3-lora-nemofw.ipynb | 117 ++++-------------- 2 files changed, 47 insertions(+), 124 deletions(-) diff --git a/tutorials/llm/llama-3/README.rst b/tutorials/llm/llama-3/README.rst index 473815802e5f..663c0c99abfc 100755 --- a/tutorials/llm/llama-3/README.rst +++ b/tutorials/llm/llama-3/README.rst @@ -1,9 +1,9 @@ Llama 3 LoRA Fine-Tuning and Deployment with NeMo Framework and NVIDIA NIM ========================================================================== -`Llama 3 `_ is an open source large language model by Meta that delivers state-of-the-art performance on popular industry benchmarks. It has been pretrained on over 15 trillion tokens, and supports an 8K token context length. It is available in two sizes, 8B and 70B, and each size has two variants—base pretrained and instruction tuned. +`Llama 3 `_ is an open-source large language model by Meta that delivers state-of-the-art performance on popular industry benchmarks. It has been pretrained on over 15 trillion tokens, and supports an 8K token context length. It is available in two sizes, 8B and 70B, and each size has two variants—base pretrained and instruction tuned. -`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters as compared to full fine-tuning, thereby reducing the compute required. +`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter-Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters as compared to full fine-tuning, thereby reducing the compute required. `NVIDIA NeMo Framework `__ provides tools to perform LoRA on Llama 3 to fit your use case, which can then be deployed using `NVIDIA NIM `__ for optimized inference on NVIDIA GPUs. @@ -16,21 +16,17 @@ Framework `__. +| NIM enables seamless deployment of multiple LoRA adapters (referred to as “multi-LoRA”) on the same base model. It dynamically loads the adapter weights based on incoming requests at runtime. This flexibility allows handling inputs from various tasks or use cases without deploying a unique model for each individual scenario. For further details, consult the `NIM documentation for LLMs `__. Requirements ------------- -In order to proceed, ensure that you have met the following requirements: - * System Configuration * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. * A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware. * `Additional NIM requirements `_. -* Requested the necessary permission from Hugging Face and Meta to download `Meta-Llama-3-8B-Instruct `_. 
Then, you can use your Hugging Face `access token `_ to download the model, which we will then convert and customize with NeMo Framework. - -* `Authenticate with NVIDIA NGC `_, and download `NGC CLI Tool `_. +* `Authenticate with NVIDIA NGC `_, and download `NGC CLI Tool `_. You will use this tool to download the model and customize it with NeMo Framework. `Create a LoRA Adapter with NeMo Framework <./llama3-lora-nemofw.ipynb>`__ @@ -38,10 +34,16 @@ In order to proceed, ensure that you have met the following requirements: This notebook shows how to perform LoRA PEFT on **Llama 3 8B Instruct** using `PubMedQA `__ with NeMo Framework. PubMedQA is a Question-Answering dataset for biomedical texts. You will use the NeMo Framework which is available as a `docker container `__. -To get started -^^^^^^^^^^^^^^ +1. Download the `Llama 3 8B Instruct .nemo `__ from NVIDIA NGC using the NGC CLI. The following command saves the ``.nemo`` format model in a folder named ``llama-3-8b-instruct-nemo_v1.0`` in the current directory. You can specify another path using the ``-d`` option in the CLI tool. + +.. code:: bash + + ngc registry model download-version "nvidia/nemo/llama-3-8b-instruct-nemo:1.0" + + +Alternatively, you can download the model from `Hugging Face `__ and convert it to the ``.nemo`` format using the Hugging Face to NeMo `Llama checkpoint conversion script `__. If you'd like to skip this extra step, the ``.nemo`` model is available on NGC as mentioned above. -1. Run the container using the following command. It assumes that you have the notebook(s) available in the current working directory. If not, mount the appropriate folder to ``/workspace``. +2. Run the container using the following command. It is assumed that you have the notebook(s) and llama-3-8b-instruct model available in the current directory. If not, mount the appropriate folder to ``/workspace``. .. code:: bash @@ -61,13 +63,13 @@ To get started -v ${PWD}/results:/results \ nvcr.io/nvidia/nemo:$FW_VERSION bash -2. From within the container, start the Jupyter lab: +3. From within the container, start the Jupyter lab: .. code:: bash jupyter lab --ip 0.0.0.0 --port=8888 --allow-root -3. Then, navigate to `this notebook <./llama3-lora-nemofw.ipynb>`__. +4. Then, navigate to `this notebook <./llama3-lora-nemofw.ipynb>`__. `Deploy Multiple LoRA Inference Adapters with NVIDIA NIM <./llama3-lora-deploy-nim.ipynb>`__ @@ -100,15 +102,11 @@ The following steps assume that you have authenticated with NGC and downloaded t popd chmod -R 777 $LOCAL_PEFT_DIRECTORY -2. Prepare the LoRA model store +2. Prepare the LoRA model store. -After training is complete, that LoRA model checkpoint will be -created at -``./results/Meta-Llama-3-8B-Instruct/checkpoints/megatron_gpt_peft_lora_tuning.nemo``, -assuming default paths in the first notebook weren’t modified. +After training is complete, that LoRA model checkpoint will be created at ``./results/Meta-Llama-3-8B-Instruct/checkpoints/megatron_gpt_peft_lora_tuning.nemo``, assuming default paths in the first notebook weren’t modified. -To ensure model store is organized as expected, create a folder named -``llama3-8b-pubmed-qa``, and move your .nemo checkpoint there. +To ensure the model store is organized as expected, create a folder named ``llama3-8b-pubmed-qa``, and move your ``.nemo`` checkpoint there. .. 
code:: bash @@ -119,7 +117,7 @@ To ensure model store is organized as expected, create a folder named -The LoRA model store directory should have a structure like so - with the name of the model as a sub-folder that contains the .nemo file. +Ensure that the LoRA model store directory follows this structure: the model name(s) should be sub-folder(s) containing the ``.nemo`` file(s). :: @@ -131,11 +129,10 @@ The LoRA model store directory should have a structure like so - with the name o └── llama3-8b-pubmed-qa └── megatron_gpt_peft_lora_tuning.nemo -The last one was just trained on the PubmedQA dataset in the previous -notebook. +The last one was just trained on the PubmedQA dataset in the previous notebook. -3. Set-up NIM +3. Set-up NIM. From your host OS environment, start the NIM docker container while mounting the LoRA model store, as follows: @@ -167,12 +164,11 @@ From your host OS environment, start the NIM docker container while mounting the -p 8000:8000 \ nvcr.io/nim/meta/llama3-8b-instruct:1.0.0 -The first time you run the command, it will download the model and cache it in ``$NIM_CACHE_PATH`` so subsequent deployments are even faster. There are several options to configure NIM other than the ones listed above. You can find a full list in `NIM configuration `__ documentation. +The first time you run the command, it will download the model and cache it in ``$NIM_CACHE_PATH`` so subsequent deployments are even faster. There are several options to configure NIM other than the ones listed above. You can find a full list in the `NIM configuration `__ documentation. -4. Start the notebook +4. Start the notebook. -From another terminal, follow the same instructions as the previous -notebook to launch Jupyter Lab, and navigate to `this notebook <./llama3-lora-deploy-nim.ipynb>`__. +From another terminal, follow the same instructions as the previous notebook to launch Jupyter Lab, and then navigate to `this notebook <./llama3-lora-deploy-nim.ipynb>`__. -You can use the same NeMo Framework docker container which already has Jupyter Lab installed. \ No newline at end of file +You can use the same NeMo Framework docker container which has Jupyter Lab already installed. \ No newline at end of file diff --git a/tutorials/llm/llama-3/llama3-lora-nemofw.ipynb b/tutorials/llm/llama-3/llama3-lora-nemofw.ipynb index 3244bf18e818..bb30ece20a37 100755 --- a/tutorials/llm/llama-3/llama3-lora-nemofw.ipynb +++ b/tutorials/llm/llama-3/llama3-lora-nemofw.ipynb @@ -15,7 +15,7 @@ "source": [ "This notebook showcases performing LoRA PEFT **Llama 3 8B** on [PubMedQA](https://pubmedqa.github.io/) using NeMo Framework. PubMedQA is a Question-Answering dataset for biomedical texts.\n", "\n", - "> `NOTE:` Ensure that you run this notebook inside the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) which has all the required dependencies. Instructions are available in the associated tutorial README." + "> `NOTE:` Ensure that you run this notebook inside the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) which has all the required dependencies. **Instructions are available in the associated tutorial README to download the model and the container.**" ] }, { @@ -32,114 +32,41 @@ }, { "cell_type": "markdown", - "id": "deb6a910-a05e-4ae1-aac4-56e5092be2b4", - "metadata": { - "tags": [] - }, - "source": [ - "---\n", - "## Step-by-step instructions\n", - "\n", - "This notebook is structured into six steps:\n", - "1. 
Download Llama-3-8B-Instruct from Hugging Face\n", - "2. Convert Llama-3-8B-Instruct to NeMo format\n", - "3. Prepare the dataset\n", - "4. Run the PEFT finetuning script\n", - "5. Inference with NeMo Framework\n", - "6. Check the model accuracy\n" - ] - }, - { - "cell_type": "markdown", - "id": "e1f8f06d-aa9b-49cf-b50b-023967fc9e1a", + "id": "0b285d5a-d838-423b-9d6c-65add61f48ce", "metadata": {}, "source": [ - "### Step 1: Download the model from Hugging Face" - ] - }, - { - "cell_type": "markdown", - "id": "b5c50597-53e9-4604-9b86-af4c8e6b027e", - "metadata": {}, - "source": [ - "> `NOTE:` Access to Meta-Llama-3-8B-Instruct is gated. Before you proceed, ensure that you have a Hugging Face account, and have requested the necessary permission from Hugging Face and Meta to download the model on the [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) page. Then, you can use your Hugging Face [access token](https://huggingface.co/docs/hub/en/security-tokens) to download the model in the following code snippet, which we will then convert and customize with NeMo Framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f14a2ea5-309b-4f78-8524-313043e9daeb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "import huggingface_hub\n", - "\n", - "# Set your Hugging Face access token\n", - "huggingface_hub.login(\"\")" + "---\n", + "## Before you begin\n", + "Ensure that you have the `Meta Llama3 8B Instruct .nemo` model downloaded and the corresponding folder mounted to the container." ] }, { "cell_type": "code", "execution_count": null, - "id": "99125f50", + "id": "3057e525-7957-45c0-bedc-c347d4811081", "metadata": { "tags": [] }, "outputs": [], "source": [ - "os.makedirs(\"./Meta-Llama-3-8B-Instruct\" ,exist_ok=True)\n", - "huggingface_hub.snapshot_download(repo_id=\"meta-llama/Meta-Llama-3-8B-Instruct\", local_dir=\"Meta-Llama-3-8B-Instruct\", local_dir_use_symlinks=False)" - ] - }, - { - "cell_type": "markdown", - "id": "18d5a8a9-41db-4186-a51a-a89d0501e1c0", - "metadata": {}, - "source": [ - "The Llama-3-8B-Instruct model will be downloaded to `./Meta-Llama-3-8B-Instruct`" + "!ls /workspace/llama-3-8b-instruct-nemo_v1.0" ] }, { "cell_type": "markdown", - "id": "49fc4629", - "metadata": {}, - "source": [ - "### Step 2: Convert Llama-3-8B-Instruct to NeMo format\n", - "\n", - "Run the below code to convert the model to the NeMo format. \n", - "\n", - "The generated `.nemo` file uses distributed checkpointing and can be loaded with any Tensor Parallel (TP) or Pipeline Parallel (PP) combination without reshaping or splitting. For more information on parallelisms in NeMo, refer to [NeMo Framework documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55331dd3", + "id": "deb6a910-a05e-4ae1-aac4-56e5092be2b4", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "%%bash\n", - "\n", - "# clear any previous temporary weights dir if any\n", - "rm -r model_weights\n", + "---\n", + "## Step-by-step instructions\n", "\n", - "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \\\n", - " --precision bf16 \\\n", - " --input_name_or_path=./Meta-Llama-3-8B-Instruct/ \\\n", - " --output_path=./Meta-Llama-3-8B-Instruct.nemo" - ] - }, - { - "cell_type": "markdown", - "id": "fafb86d7-6254-42d4-b9aa-ab8a723f90c1", - "metadata": {}, - "source": [ - "This will create a .nemo model file in current working directory." + "This notebook is structured into four steps:\n", + "1. Prepare the dataset\n", + "2. Run the PEFT finetuning script\n", + "3. Inference with NeMo Framework\n", + "4. Check the model accuracy" ] }, { @@ -147,7 +74,7 @@ "id": "8ea5bd31", "metadata": {}, "source": [ - "### Step 3: Prepare the dataset\n", + "### Step 1: Prepare the dataset\n", "\n", "Download the PubMedQA dataset and run the pre-processing script in the cloned directory." ] @@ -288,7 +215,7 @@ "metadata": {}, "source": [ "\n", - "### Step 4: Run PEFT finetuning script for LoRA\n", + "### Step 2: Run PEFT finetuning script for LoRA\n", "\n", "NeMo framework includes a high level python script for fine-tuning [megatron_gpt_finetuning.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py) that can abstract away some of the lower level API calls. Once you have your model downloaded and the dataset ready, LoRA fine-tuning with NeMo is essentially just running this script!\n", "\n", @@ -309,7 +236,7 @@ "%%bash\n", "\n", "# Set paths to the model, train, validation and test sets.\n", - "MODEL=\"./Meta-Llama-3-8B-Instruct.nemo\"\n", + "MODEL=\"/workspace/llama-3-8b-instruct-nemo_v1.0/8b_instruct_nemo_bf16.nemo\"\n", "TRAIN_DS=\"[./pubmedqa/data/pubmedqa_train.jsonl]\"\n", "VALID_DS=\"[./pubmedqa/data/pubmedqa_val.jsonl]\"\n", "TEST_DS=\"[./pubmedqa/data/pubmedqa_test.jsonl]\"\n", @@ -377,7 +304,7 @@ "tags": [] }, "source": [ - "### Step 5: Inference with NeMo Framework\n", + "### Step 3: Inference with NeMo Framework\n", "\n", "Running text generation within the framework is also possible with running a Python script. Note that is more for testing and validation, not a full-fledged deployment solution like NVIDIA NIM." ] @@ -454,7 +381,7 @@ "id": "2fe048f9", "metadata": {}, "source": [ - "### Step 6: Check the model accuracy\n", + "### Step 4: Check the model accuracy\n", "\n", "Now that the results are in, let's read the results and calculate the accuracy on the pubmedQA task. 
You can compare your accuracy results with the public leaderboard at https://pubmedqa.github.io/.\n", "\n", @@ -565,8 +492,8 @@ "source": [ "For the Llama-3-8B-Instruct model, you should see accuracy comparable to the below:\n", "```\n", - "Accuracy 0.786000\n", - "Macro-F1 0.550305\n", + "Accuracy 0.792000\n", + "Macro-F1 0.594778\n", "```" ] } From 4e5174bde02369c002ffb9f4b1cfc7b7bab77174 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:21:45 -0700 Subject: [PATCH 014/173] fix pipeline parallel dtype bug (#9637) (#9661) Signed-off-by: ashors1 Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: Marc Romeyn Co-authored-by: ashors1 --- nemo/lightning/_strategy_lib.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index e6452de16512..b38883b95643 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -136,6 +136,7 @@ def set_model_parallel_attributes(model, parallelism): config.expert_model_parallel_size = parallelism.expert_model_parallel_size config.moe_extended_tp = parallelism.moe_extended_tp config.sequence_parallel = parallelism.sequence_parallel + config.pipeline_dtype = parallelism.pipeline_dtype return config From 900ca0b8bac3d038b9854bd498967b0525aaeb35 Mon Sep 17 00:00:00 2001 From: Slyne Deng Date: Wed, 10 Jul 2024 18:24:24 -0700 Subject: [PATCH 015/173] LITA integration (#9578) * add lita Signed-off-by: Slyne Deng * Apply isort and black reformatting Signed-off-by: Slyne * add part of the tutorial and fix format Signed-off-by: slyne deng * add tutorial Signed-off-by: slyne deng * fix Tutorial ckpt conversion Signed-off-by: slyne deng * Apply isort and black reformatting Signed-off-by: Slyne * update cicd Signed-off-by: Slyne Deng * add to CIICD test Signed-off-by: Slyne Deng * changes based on review comments Signed-off-by: Slyne Deng * fix bot warning Signed-off-by: Slyne Deng * update cicd main Signed-off-by: Slyne Deng * fix cicd ckpt conversion Signed-off-by: Slyne Deng --------- Signed-off-by: Slyne Deng Signed-off-by: Slyne Signed-off-by: slyne deng Co-authored-by: Slyne Deng Co-authored-by: Slyne Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 24 +- .../multimodal_llm/neva/conf/lita_config.yaml | 242 +++++++ .../multimodal_llm/neva/conf/vita_config.yaml | 231 +++++++ ...va_to_neva.py => convert_llava_to_neva.py} | 142 +++- .../neva/eval/eval_video_rtl.py | 196 ++++++ .../multimodal_llm/neva/eval/eval_vqa.py | 207 ++++++ .../multimodal_llm/neva/neva_evaluation.py | 202 ++++-- .../multimodal/data/neva/conversation.py | 4 + .../multimodal/data/neva/neva_dataset.py | 105 ++- .../models/multimodal_llm/neva/neva_model.py | 175 ++++- nemo/collections/multimodal/parts/utils.py | 31 +- .../common/text_generation_strategy.py | 17 + .../modules/common/text_generation_utils.py | 95 ++- .../convert_dvc_dataset_for_evaluation.py | 160 +++++ .../convert_dvc_dataset_for_training.py | 322 +++++++++ .../convert_video_qa_dataset.py | 184 ++++++ .../generate_qa_data.py | 369 +++++++++++ .../prepare_youmakeup.py | 325 +++++++++ tutorials/multimodal/LITA Tutorial.ipynb | 621 ++++++++++++++++++ tutorials/multimodal/NeVA Tutorial.ipynb | 4 +- tutorials/multimodal/README.md | 1 + tutorials/multimodal/images/LITA_arch.png | Bin 0 -> 268131 bytes 22 files changed, 3547 insertions(+), 110 deletions(-) create mode 100644 
examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml create mode 100644 examples/multimodal/multimodal_llm/neva/conf/vita_config.yaml rename examples/multimodal/multimodal_llm/neva/{convert_hf_llava_to_neva.py => convert_llava_to_neva.py} (73%) create mode 100644 examples/multimodal/multimodal_llm/neva/eval/eval_video_rtl.py create mode 100644 examples/multimodal/multimodal_llm/neva/eval/eval_vqa.py create mode 100644 scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_evaluation.py create mode 100644 scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_training.py create mode 100644 scripts/multimodal_dataset_conversion/convert_video_qa_dataset.py create mode 100644 scripts/multimodal_dataset_conversion/generate_qa_data.py create mode 100644 scripts/multimodal_dataset_conversion/prepare_youmakeup.py create mode 100644 tutorials/multimodal/LITA Tutorial.ipynb create mode 100644 tutorials/multimodal/images/LITA_arch.png diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 10cd8d1e6561..102b4a30f39e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -179,7 +179,28 @@ jobs: rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights - + + # L2: Community llava multimodal Checkpoints tests + L2_Community_vita_Checkpoints_tests_Llama3: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH + CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \ + --in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \ + --mm-projector-ckpt-dir /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/mm_projector \ + --mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \ + --tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \ + --config-file vita_config.yaml \ + --out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo \ + --model-type VITA \ + --conv-template llama_3 + AFTER_SCRIPT: | + rm -f /home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo + rm -rf /home/TestData/multimodal/video_neva/llama3-ci-hf/model_weights + # this test is using a 7B model which is too large for GitHub CI # replace the model in this test with a toy model or move the test # to the nightly CI @@ -4535,6 +4556,7 @@ jobs: - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon + - L2_Community_vita_Checkpoints_tests_Llama3 #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2 - ASR_dev_run_Speech_to_Text - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet diff --git a/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml new file mode 100644 index 000000000000..591f528810fc --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml @@ -0,0 +1,242 @@ +name: nemo_video_lita_neva +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: nemo_video_neva_lita + create_wandb_logger: True + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +model: + precision: ${trainer.precision} + + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + + # Batch size guideline for different types of dataset + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 2 # will use more micro batches to reach global batch size + + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # kqv model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + restore_from_path: null # used in fine-tuning + + # Multimodal configs + mm_cfg: + llm: + from_pretrained: null #path to nemo checkpoint + freeze: False + model_type: llama_2 # `nvgpt` or `llama_2` supported + vision_encoder: + from_pretrained: "Lin-Chen/ShareGPT4V-13B_Pretrained_vit-large336-l12" # huggingface path or name + from_hf: True + crop_size: [336, 336] + patch_dim: 14 + hidden_size: 1024 # could be found from model but tricky in code + vision_select_layer: -2 # default to the last layer + class_token_length: 1 + freeze: True + lita: + lita_video_arch: 'temporal_all_resolution' # ['temporal_spatial_pool', 'temporal_spatial', 'temporal_all_resolution'] 'temporal_spatial_pool' is used in lita1.0 + visual_token_format: 'im_vid_start_end' # ["v1", "im_vid_start_end"] v1 means do nothing, im_vid_start_end means add image and video start and end tokens around spatial and temporal tokens + sample_frames: 4 # for lita 1.5 sample_frames are used for spatial tokens, and spatial tokens will no longer do pooling and instead, it will use full tokens + use_lita: True + pretrain_mm_mlp_adapter: null # path to pretrained mm adapter + mm_mlp_adapter_type: mlp2x_gelu # ['linear', 'mlp2x_gelu', 'mlp_downsample'] + use_im_start_end: False + + # ========LORA configs start======= + #peft: + # peft_scheme: "lora" + # restore_from_path: null + # lora_tuning: + # adapter_dim: 128 + # alpha: 256 + # target_modules: ['all'] + # adapter_dropout: 0.0 + # column_init_method: 'xavier' # IGNORED if 
linear_adapter is used, options: xavier, zero or normal + # row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + # layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + # weight_tying: False + # position_embedding_strategy: null # used only when weight_tying is True + # =======LORA configs end======= + + # LLM configs + # use GPTModel from megatron.core + mcore_gpt: True + + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: rope + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + init_method_std: 0.014 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 16 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: False # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. 
+ + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: False + bias_activation_fusion: False + megatron_legacy: False + + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + async_grad_allreduce: False + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'sentencepiece' + type: null + model: /ws/converted_nemo_model/tokenizer_1_5.model + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + additional_special_tokens: null # ["", "", "", "", "", ""] + + data: + packed_sequence: False + num_workers: 8 + dataloader_type: cyclic + data_path: null + lazy_preprocess: True + is_multimodal: True + media_type: video # currently supported: image or video + splice_single_frame: null # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. 
+ num_frames: 256 # selects the number of frames to use from the video + sep_token_between_frames: False # TODO: allow usage of separator tokens between frames + sep_image_conv_front: False + image_token_len: 576 #lita 1.0 uses 256 + conv_template: v1 # check `nemo/collections/multimodal/data/neva/conversation.py` + image_folder: null + video_folder: null + image_aspect_ratio: 'pad' # lita 1.0 uses 'square' + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-5 + weight_decay: 0. + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 140 + constant_steps: 0 + min_lr: 2e-7 diff --git a/examples/multimodal/multimodal_llm/neva/conf/vita_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/vita_config.yaml new file mode 100644 index 000000000000..7be99308a280 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/conf/vita_config.yaml @@ -0,0 +1,231 @@ +name: nemo_video_lita_neva +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: nemo_video_neva_lita + create_wandb_logger: True + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +model: + precision: ${trainer.precision} + + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + + # Batch size guideline for different types of dataset + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 128 # will use more micro batches to reach global batch size + + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # kqv model parallelism + virtual_pipeline_model_parallel_size: null # 
interleaved pipeline + + restore_from_path: null # used in fine-tuning + + # Multimodal configs + mm_cfg: + llm: + from_pretrained: null #path to nemo checkpoint + freeze: False + model_type: vita + vision_encoder: + from_pretrained: null # path or name + model_type: null + from_hf: True + crop_size: [384, 384] + patch_dim: 14 + hidden_size: 1152 # could be found from model but tricky in code + vision_select_layer: -2 # default to the last layer + vision_select_feature: 'cls_patch' # default is patch + class_token_length: 1 + freeze: True + lita: + lita_video_arch: 'temporal_all_resolution' # ['temporal_spatial_pool', 'temporal_spatial', 'temporal_all_resolution'] + visual_token_format: 'im_vid_start_end' # ["v1", "im_vid_start_end"] v1 means do nothing, im_vid_start_end means add image and video start and end tokens around spatial and temporal tokens + sample_frames: 4 # for lita 1.5 sample_frames are used for spatial tokens, and spatial tokens will no longer do pooling and instead, it will use full tokens + use_lita: True + pretrain_mm_mlp_adapter: null # path to pretrained mm adapter + mm_mlp_adapter_type: mlp_downsample # ['linear', 'mlp2x_gelu', 'mlp_downsample'] + + use_im_start_end: False + + + # LLM configs + # use GPTModel from megatron.core + mcore_gpt: True + + # model architecture + encoder_seq_length: 8192 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: rope + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + init_method_std: 0.014 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 16 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + rotary_base: 500000.0 # default is 10000 + attention_type: 'multihead' # Attention type. 
Options ['multihead'] + share_embeddings_and_output_weights: False # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: False + bias_activation_fusion: False + megatron_legacy: False + + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + async_grad_allreduce: False + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'huggingface' + type: /ws/converted_models/tokenizer # set huggingface tokenizer here; And check `LITA Tutorial.ipynb` for how to add time tokens to tokenizer + model: null # set sentencepiece model path here if tokenizer is sentencepiece + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + additional_special_tokens: null # ["", "", "", "", "", ""] + + data: + packed_sequence: False + num_workers: 8 + dataloader_type: cyclic + data_path: null + lazy_preprocess: True + is_multimodal: True + media_type: video # currently supported: image or video + splice_single_frame: null # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. + num_frames: 256 # selects the number of frames to use from the video + sep_token_between_frames: False # TODO: allow usage of separator tokens between frames + sep_image_conv_front: False + image_token_len: 784 # 28x28 + conv_template: llama_3 # check `nemo/collections/multimodal/data/neva/conversation.py` + image_folder: null + video_folder: null + image_aspect_ratio: 'pad' # in vila, it's `resize` + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-5 + weight_decay: 0. + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 140 + constant_steps: 0 + min_lr: 2e-7 diff --git a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py b/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py similarity index 73% rename from examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py rename to examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py index 2cbb4c2b3b82..d02b737c750a 100644 --- a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py +++ b/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py @@ -13,15 +13,22 @@ # limitations under the License. r""" -Script to convert HuggingFace LLaVA checkpoints into .nemo file. - Example to run this conversion script: - python convert_hf_llava_to_neva.py \ - --in-file \ - --out-file \ - --tokenizer-model \ - --conv-template llama_2 # nvgpt, llama_2, v1 (vicuna) +Script to convert LLaVA checkpoints into .nemo file. 
+This script depend on llava github project: +https://github.com/haotian-liu/LLaVA/tree/main + +If you want to convert huggingface LLaVA checkpoint such as llava-hf/llava-1.5-7b-hf, +you should check `NeMo/scripts/checkpoint_converters/convert_llava_hf_to_nemo.py` + +Example to run this conversion script: + python convert_hf_llava_to_neva.py \ + --in-file \ + --out-file \ + --tokenizer-model \ + --conv-template llama_2 # nvgpt, llama_2, v1, llama_3 (vicuna) """ +import json import os from argparse import ArgumentParser from collections import OrderedDict @@ -31,6 +38,7 @@ from omegaconf import OmegaConf from pytorch_lightning.core.saving import _load_state as ptl_load_state from pytorch_lightning.trainer.trainer import Trainer +from safetensors import safe_open from transformers import LlamaTokenizer from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel @@ -47,7 +55,11 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--in-file", type=str, default=None, required=True, help="Path to Huggingface LLaMA checkpoints", + "--in-file", + type=str, + default=None, + required=True, + help="Path to LLaVA checkpoints", ) parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument( @@ -61,6 +73,16 @@ def get_args(): "--tokenizer-model", type=str, default=None, required=False, help="Path to sentencepiece tokenizer model." ) parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--config-file", type=str, default="llava_config.yaml") + parser.add_argument( + "--mm-projector-ckpt-dir", + type=str, + default=None, + help="Path to multimodal projector checkpoint directory \ + This will overlap the projector weights in in-file hf checkpoint", + ) + parser.add_argument("--mm-vision-tower", type=str, default=None) + parser.add_argument("--model-type", type=str, default=None) args = parser.parse_args() return args @@ -110,13 +132,32 @@ def load_model(cls, checkpoint, strict, **kwargs): def load_config(args, llava_config): - nemo_config = OmegaConf.load(os.path.join(os.path.dirname(__file__), 'conf/llava_config.yaml')).model + nemo_config = OmegaConf.load(os.path.join(os.path.dirname(__file__), 'conf', args.config_file)).model nemo_config.mm_cfg.mm_mlp_adapter_type = llava_config.get('mm_projector_type', 'linear') - nemo_config.mm_cfg.vision_encoder.from_pretrained = llava_config.get( - 'mm_vision_tower', 'openai/clip-vit-large-patch14' - ) - if '336' in nemo_config.mm_cfg.vision_encoder.from_pretrained: - nemo_config.data.image_token_len = 576 + + mm_vision_tower = llava_config.get('mm_vision_tower', 'openai/clip-vit-large-patch14') + + if args.mm_vision_tower is not None: + mm_vision_tower = args.mm_vision_tower + + nemo_config.mm_cfg.vision_encoder.from_pretrained = mm_vision_tower + if args.mm_vision_tower is not None: + config_file = os.path.join(args.mm_vision_tower, "config.json") + if os.path.exists(config_file): + with open(config_file, "r") as f: + vision_model_config = json.load(f) + nemo_config.mm_cfg.vision_encoder["model_type"] = vision_model_config.get("model_type", 'clip') + crop_size = vision_model_config.get("image_size", 224) + nemo_config.mm_cfg.vision_encoder.crop_size = [crop_size, crop_size] + else: + if '336' in mm_vision_tower: + nemo_config.data.image_token_len = 576 + nemo_config.mm_cfg.vision_encoder.crop_size = [336, 336] + else: + nemo_config.data.image_token_len = 256 + 
nemo_config.mm_cfg.vision_encoder.crop_size = [224, 224] + nemo_config.mm_cfg.vision_encoder.patch_dim = 14 + nemo_config.encoder_seq_length = llava_config['max_position_embeddings'] nemo_config.num_layers = int(llava_config['num_hidden_layers']) nemo_config.hidden_size = llava_config['hidden_size'] @@ -130,16 +171,34 @@ def load_config(args, llava_config): nemo_config.use_cpu_initialization = True nemo_config.activation = 'fast-swiglu' nemo_config.data.conv_template = args.conv_template - nemo_config.mm_cfg.model_type = args.conv_template + nemo_config.data.image_aspect_ratio = llava_config.get('image_aspect_ratio', 'square') + if args.model_type is None: + nemo_config.mm_cfg.model_type = args.conv_template + else: + nemo_config.mm_cfg.model_type = args.model_type if args.tokenizer_model is None: - nemo_config.tokenizer.model = llava_config['tokenizer_model'] + if 'tokenizer_model' in llava_config: + nemo_config.tokenizer.library = 'sentencepiece' + nemo_config.tokenizer.model = llava_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = {'library': 'huggingface', 'type': args.in_file, 'use_fast': True, 'model': None} + nemo_config.tokenizer.update(tokenizer_dict) else: - nemo_config.tokenizer.model = args.tokenizer_model + # if tokenizer_model is directory + if os.path.isdir(args.tokenizer_model): + tokenizer_dict = {'library': 'huggingface', 'type': args.tokenizer_model, 'use_fast': True, 'model': None} + nemo_config.tokenizer.update(tokenizer_dict) + else: + nemo_config.tokenizer.library = 'sentencepiece' + nemo_config.tokenizer.model = args.tokenizer_model if llava_config['rope_scaling'] is not None: if llava_config['rope_scaling']['type'] == 'linear': nemo_config['seq_len_interpolation_factor'] = llava_config['rope_scaling']['factor'] else: raise ValueError("Only linear rope scaling type is supported now") + if llava_config.get('rope_theta', None): + nemo_config['rotary_base'] = llava_config['rope_theta'] base = 128 while llava_config['vocab_size'] % base != 0: @@ -152,16 +211,15 @@ def load_config(args, llava_config): def convert(args): logging.info(f"loading checkpoint {args.in_file}") model = LlavaLlamaForCausalLM.from_pretrained(args.in_file) - tokenizer = LlamaTokenizer.from_pretrained(args.in_file) hf_config = vars(model.config) - hf_config['tokenizer_model'] = str(tokenizer.vocab_file) - print(f"hf_config: {hf_config}") - print("named parameters:") + if os.path.exists(f'{args.in_file}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.in_file) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + for name, param in model.named_parameters(): print(f"- {name}") nemo_config = load_config(args, hf_config) - print(nemo_config) if args.precision in ["32", "16"]: precision = int(float(args.precision)) @@ -179,7 +237,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) @@ -235,10 +293,42 @@ def convert(args): for key in model.state_dict(): if 'mm_projector' in key: mm_projection_layer_suffix = key.split('mm_projector')[1] - checkpoint['state_dict'][ - f'{mm_projection_layer_base_name}{mm_projection_layer_suffix}' - ] = param_to_weights(model.state_dict()[key]) + 
checkpoint['state_dict'][f'{mm_projection_layer_base_name}{mm_projection_layer_suffix}'] = ( + param_to_weights(model.state_dict()[key]) + ) + # Replace or add the projection weights + proj_ckpt = None + if args.mm_projector_ckpt_dir is not None: + if os.path.exists(args.mm_projector_ckpt_dir): + ckpt_path = os.path.join(args.mm_projector_ckpt_dir, "mm_projector.bin") + if os.path.exists(ckpt_path): + proj_ckpt = torch.load(ckpt_path) + else: + ckpt_path = os.path.join(args.mm_projector_ckpt_dir, "model.safetensors") + proj_ckpt = {} + with safe_open(ckpt_path, framework="pt", device="cuda") as f: + for key in f.keys(): + new_key = key.replace("layers.", "mm_projector.") + proj_ckpt[new_key] = f.get_tensor(key) + else: + raise FileNotFoundError(f"mm_projector_ckpt_dir {args.mm_projector_ckpt_dir} does not exist.") + for key in proj_ckpt.keys(): + if 'mm_projector' in key: + mm_projection_layer_suffix = key.split('mm_projector')[1] + checkpoint['state_dict'][f'{mm_projection_layer_base_name}{mm_projection_layer_suffix}'] = ( + param_to_weights(proj_ckpt[key]) + ) + + proj_conf_file = open(os.path.join(args.mm_projector_ckpt_dir, "config.json")) + + proj_conf = json.load(proj_conf_file) + if proj_conf['mm_projector_type'] != nemo_config.mm_cfg.mm_mlp_adapter_type: + logging.warning( + f"Overriding mm_projector_type from {nemo_config.mm_cfg.mm_mlp_adapter_type} to {proj_conf['mm_projector_type']}" + ) + nemo_config.mm_cfg.mm_mlp_adapter_type = proj_conf['mm_projector_type'] + proj_conf_file.close() embed_weight = model.state_dict()[f'model.embed_tokens.weight'] if mcore_gpt: embed_weights_base_name = f'model.embedding.word_embeddings.weight' diff --git a/examples/multimodal/multimodal_llm/neva/eval/eval_video_rtl.py b/examples/multimodal/multimodal_llm/neva/eval/eval_video_rtl.py new file mode 100644 index 000000000000..3567cf431d87 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/eval/eval_video_rtl.py @@ -0,0 +1,196 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script is used for evaluating RTL (Reasoning Temporal Localization) task. +It accepts one JSON file. The JSON file should have the following structure: +[ + { + "video": "rY7eLyJF31M_6.mp4", + "question_id": "rY7eLyJF31M_6_0", + "question": "When is \"Apply mascara , false lashes on the lashes \" depicted in the video? Convey your answer using start and end timestamps exclusively.", + "ref_answer": "<0> <53> Apply mascara , false lashes on the lashes ", + "duration": 102.002002002002, + "pred_answer": "<1> <53> Apply mascara , false lashes on the lashes ", + }, + { + "video": "rY7eLyJF31M_6.mp4", + "question_id": "rY7eLyJF31M_6_1", + "question": "When is \"Apply foundation on the face with a brush\" depicted in the video? 
Provide a response using only start and end timestamps.", + "ref_answer": "<56> <97> Apply foundation on the face with a brush", + "duration": 102.002002002002, + "pred_answer": "<50> <97> Apply foundation on the face with a brush", + }, +] + +The `xxx_answer` field should contain the start and end timestamps such as `<56>` and `<97>` of the event along with the sentence. +If not, the [0, duration] will be used as the predicted timestamps. + +USAGE: +python eval_rtl.py --input_file \ + --output_dir \ + --save_mid_result +""" +import argparse +import json +import os +import re +from collections import defaultdict + + +def iou(seg1, seg2): + """Compute the intersection over union (IoU) between two segments. + + Args: + seg1 (list): [start, end] + seg2 (list): [start, end] + + Returns: + float: IoU value + """ + assert seg1[1] >= seg1[0] and seg2[1] >= seg2[0] + + x1 = max(seg1[0], seg2[0]) + x2 = min(seg1[1], seg2[1]) + inter = max(x2 - x1, 0) + + len1 = max(seg1[1] - seg1[0], 0) + len2 = max(seg2[1] - seg2[0], 0) + + union = len1 + len2 - inter + + if union == 0: + return 0.0 + else: + return inter / union + + +def precision_func(thres): + """calculate the precision based on the threshold. + If the IoU value is greater than or equal to the threshold, \ + the precision is 1.0, otherwise 0.0. + + Args: + thres (float): threshold value [0.0, 1.0] + """ + + def precision(seg1, seg2): + return float(iou(seg1, seg2) >= thres) + + return precision + + +def parse_start_end_timestamps(outputs, duration, strict=False): + timestamp_pattern = '\<(?: (?: \d* \.? \d+ ) | (?: \d+ \.? ) )\>' + rx = re.compile(timestamp_pattern, re.VERBOSE) + matches = list(rx.finditer(outputs)) + if strict: + assert len(list(matches)) >= 2, "cannot find timestamps" + elif len(list(matches)) < 2: + return outputs, [0, duration] + + prev_end = 0 + sentence = "" + timestamps = [] + for i in range(2): + m = matches[i] + start = m.start(0) + end = m.end(0) + timestamp = float(m.group(0)[1:-1]) + timestamp = min(max(timestamp, 0), duration) + timestamps.append(timestamp) + sentence += outputs[prev_end:start] + prev_end = end + sentence += outputs[prev_end:] + sentence = sentence.strip() + + return sentence, [min(timestamps), max(timestamps)] + + +def eval(pred_file, output_dir, save_mid_result=True): + """Evaluate the predictions against the ground truth. + + Args: + pred_file (str): path to the predictions JSON file + output_dir (str): path to the output directory, + where the `answers.json` and `metrics.json` result will be saved. 
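+        save_mid_result (bool): if True, the per-question parsed answers are
+            also written to `answers.json` inside `output_dir`.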
+ """ + metric_func = {'iou': iou, 'precision@0.5': precision_func(0.5)} + metrics = {} + for metric in metric_func: + metrics[metric] = defaultdict(list) + + with open(pred_file, 'r') as f: + pred_data = json.load(f) + + out_list = [] + for pred in pred_data: + assert "pred_answer" in pred, "pred_answer field is missing" + assert "ref_answer" in pred, "answer field is missing" + duration = pred['duration'] + pred_answer, pred_timestamps = parse_start_end_timestamps(pred['pred_answer'], duration, strict=False) + ref_answer, ref_timestamps = parse_start_end_timestamps(pred['ref_answer'], duration, strict=False) + + for metric in metric_func: + metrics[metric][pred['video']].append(metric_func[metric](pred_timestamps, ref_timestamps)) + + out_list.append( + { + 'video': pred['video'], + 'question_id': pred['question_id'], + 'question': pred['question'], + 'pred_answer': pred_answer, + 'ref_answer': ref_answer, + 'pred_timestamps': pred_timestamps, + 'ref_timestamps': ref_timestamps, + } + ) + # save result + os.makedirs(output_dir, exist_ok=True) + if save_mid_result: + output_file = os.path.join(output_dir, 'answers.json') + print(f"Saving intermediate result to {output_file}") + with open(output_file, 'w') as f: + json.dump(out_list, f, indent=2) + + final_result = {} + for metric in metrics: + values = [] + for vid in metrics[metric]: + # get single video metric value + cur_metric_values = metrics[metric][vid] + values.append(sum(cur_metric_values) / len(cur_metric_values)) + # get global average video metric value + values = sum(values) / len(values) + final_result[metric] = values + + print(final_result) + output_file = os.path.join(output_dir, 'metrics.json') + with open(output_file, 'w') as f: + json.dump(final_result, f, indent=2) + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate the predictions against the ground truth") + parser.add_argument("--input_file", help="Path to the input JSON file", required=True) + parser.add_argument("--output_dir", help="Path to the output directory", required=True) + parser.add_argument("--save_mid_result", action="store_true", help="Save intermediate result") + args = parser.parse_args() + + eval(args.input_file, args.output_dir, args.save_mid_result) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/multimodal_llm/neva/eval/eval_vqa.py b/examples/multimodal/multimodal_llm/neva/eval/eval_vqa.py new file mode 100644 index 000000000000..8929648a3f97 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/eval/eval_vqa.py @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script is used for evaluating Video Question Answering task by leveraging LLM API as a judge. +It accepts one JSON file. 
The JSON file should have the following structure: +[ + { + "video": "YRvBOLRgZNc_2".mp4", + "question_id": "v_yVgL8sJQxYo_2_5", + "question": "What tools are used to apply foundation on the skin between <5s> and <60s>?", + "ref_answer": "A brush and blender.", + "duration": 102.002002002002, + "pred_answer": "A brush", + }, + { + "video": "yVgL8sJQxYo_2.mp4", # not a must-to-have field + "question": "How long does the action of applying foundation take?", + "question_id": "v_yVgL8sJQxYo_2_5" + "ref_answer": "The action takes around 55 seconds (<60s> - <5s>)." + "duration": 102.002002002002, # not a must-to-have field + "pred_answer": "This action takes around 50 seconds.", + } + + ... +] + +`video` and `duration` are two optional fields. If not provided, the script will ignore them. + +Notice that the time token here is represented as '<%ss>'.format(time_in_seconds). + +For the external LLM API, we use `meta/llama3-70b-instruct"` as an example. +You can go to: https://build.nvidia.com/explore/discover to choose the one that fits your needs. +Notice the API might be a little bit different. + +You also need an `API_TOKEN` from here: https://build.nvidia.com/explore/discover#llama3-70b +Click the `Get API Key` and save your key in the environment variable `API_TOKEN`. + +USAGE: +API_TOKEN= python eval_qa.py --input_file --output_dir --save_mid_result +""" + +import argparse +import ast +import json +import os +import re + +import requests + + +def parse_args(): + parser = argparse.ArgumentParser(description="Evaluate Video Question Answering task.") + parser.add_argument("--input_file", type=str, required=True, help="Path to the prediction file. json list file") + parser.add_argument("--output_dir", type=str, required=True, help="Path to the output directory.") + parser.add_argument("--save_mid_result", action="store_true", help="Whether to save the intermediate results.") + return parser.parse_args() + + +INVOKE_URL = "https://integrate.api.nvidia.com/v1/chat/completions" +# MODEL="mistralai/mixtral-8x22b-instruct-v0.1" # no `system` role +MODEL = "meta/llama3-70b-instruct" + + +def request_nvidia_api(messages): + API_TOKEN = os.getenv("API_TOKEN", "") # ADD NGC API TOKEN HERE + if not API_TOKEN: + raise ValueError("Please provide the API_TOKEN in the environment variable.") + headers = { + "Authorization": f"Bearer {API_TOKEN}", + "accept": "text/event-stream", + "content-type": "application/json", + } + payload = { + "model": MODEL, + "messages": messages, + "temperature": 0.5, + "top_p": 1.0, + "max_tokens": 2048, + "seed": 42, + "stream": True, + } + invoke_url = INVOKE_URL + response = requests.post(invoke_url, headers=headers, json=payload, stream=True) + output = "" + for line in response.iter_lines(): + if line == b'data: [DONE]': + break + if line: + res = json.loads(line.decode("utf-8").split("data: ")[1]) + if 'content' in res['choices'][0]['delta']: + output += res['choices'][0]['delta']['content'] + return output.lstrip().strip() + + +def convert_time_token(text): + # use regular expression to convert <12> <56> to <12s> <56s> + return re.sub(r'<(\d+)>', r'<\1s>', text) + + +def get_result(question, answer, pred, key, output_dir, save_mid_result=False): + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. 
Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the meaningful match between the predicted answer and the correct answer.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Evaluate the correctness of the prediction compared to the answer.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. " + "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}.", + }, + ] + try: + response_message = request_nvidia_api(messages) + response_dict = ast.literal_eval(response_message) + except Exception as e: + print(f"Error processing file {key}: {e}") + return [] + qa_set = {"question": question, "ref_answer": answer, "pred_answer": pred} + result_qa_pair = [response_dict, qa_set] + if save_mid_result: + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f) + return result_qa_pair + + +def main(): + args = parse_args() + input_file = args.input_file + output_dir = args.output_dir + save_mid_result = args.save_mid_result + with open(input_file, "r") as f: + data = json.load(f) + + tasks = [] + key = 0 + for item in data: + question = item["question"] + item["ref_answer"] = convert_time_token(item["ref_answer"]) + tasks.append((question, item["ref_answer"], item["pred_answer"], key, output_dir, save_mid_result)) + key += 1 + + # TODO: parallelize the requests + results = [] + while len(tasks) > 0: + task = tasks.pop() + key = task[3] + cur_result = get_result(*task) + if cur_result == []: + tasks.append(task) + continue + results.append((key, cur_result)) + + score_sum = count = yes_count = no_count = 0 + for key, result in results: + try: + count += 1 + score_sum += int(result[0]["score"]) + + if "yes" in result[0]["pred"].lower(): + yes_count += 1 + elif "no" in result[0]["pred"].lower(): + no_count += 1 + except Exception as e: + print(f"Error processing file {key}") + + average_score = score_sum / count + accuracy = yes_count / (yes_count + no_count) + result_file = os.path.join(output_dir, "metrics.json") + metrics = { + "average_score": average_score, + "accuracy": accuracy, + "no_count": no_count, + "yes_count": yes_count, + "model": MODEL, + } + print("Metrics: ", metrics) + with open(result_file, "w") as f: + json.dump(metrics, f, indent=2) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py index dcc79029463c..75d8a907b796 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py +++ b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py @@ -15,7 +15,7 @@ import json import os import torch -from torch.utils.data import Dataset +from torch.utils.data import DataLoader, Dataset from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, 
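Each judge reply is expected to be a Python dict literal, which get_result parses with ast.literal_eval before the score and yes/no totals are accumulated; a reply that fails to parse is re-queued. A self-contained illustration of that parsing and reduction, with made-up replies:

import ast

# Hypothetical judge replies, in the exact string form the prompt asks for.
raw_replies = ["{'pred': 'yes', 'score': 4}", "{'pred': 'no', 'score': 1}"]
parsed = [ast.literal_eval(r) for r in raw_replies]

score_sum = sum(int(r["score"]) for r in parsed)
yes_count = sum(1 for r in parsed if "yes" in r["pred"].lower())
no_count = sum(1 for r in parsed if "yes" not in r["pred"].lower() and "no" in r["pred"].lower())

print({"average_score": score_sum / len(parsed),         # 2.5
       "accuracy": yes_count / (yes_count + no_count)})  # 0.5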
SamplingParam @@ -36,24 +36,109 @@ raise EnvironmentError("GPU is needed for the inference") -class RequestDataSet(Dataset): - def __init__(self, sentences): - super().__init__() - self.sentences = sentences - - def __len__( +class TemporalNevaDataset(Dataset): + def __init__( self, + prompt_dicts, + media_base_path, + media_token, + insert_media_token=None, + image_processor=None, + video_processor=None, + add_media_sep=False, ): - return len(self.sentences) + self.prompt_dicts = prompt_dicts + self.media_token = media_token + self.insert_media_token = insert_media_token + self.media_base_path = media_base_path + self.image_processor = image_processor + self.video_processor = video_processor + self.add_media_sep = add_media_sep + # [(media_name, [prompt_dict, prompt_dict, ...]), ...} + self.media_prompt_list = [] + self.group_by_media(media_token) + + def group_by_media(self, media_token): + """ + This function groups the prompt dicts by the media/video/image file name + """ + media_dict = {} + media = media_token.lstrip('<').rstrip('>') + for prompt_dict in self.prompt_dicts: + media_name = prompt_dict[media] # video or image file name + if media_name not in media_dict: + media_dict[media_name] = [] + media_dict[media_name].append(prompt_dict) + self.media_prompt_list = list(media_dict.items()) + + def __len__(self) -> int: + return len(self.media_prompt_list) + + def __getitem__(self, idx) -> dict: + """ + Return a list of prompt dicts for the idx-th media + For a single media file, only one media feature is returned + This would help improve performance as well as save GPU memory + """ + prompt_dict_list = self.media_prompt_list[idx][1] + cur_item = [] + cur_media_feature = None + for prompt_dict in prompt_dict_list: + if 'prompt' not in prompt_dict: + prompt_dict['prompt'] = prompt_dict['text'] if 'text' in prompt_dict else prompt_dict['question'] + if self.insert_media_token == 'left': + if self.add_media_sep: + prompt_dict['prompt'] = self.media_token + " \n" + prompt_dict['prompt'] + else: + prompt_dict['prompt'] = self.media_token + prompt_dict['prompt'] + elif self.insert_media_token == 'right': + if self.add_media_sep: + prompt_dict['prompt'] = prompt_dict['prompt'] + self.media_token + " \n" + else: + prompt_dict['prompt'] = prompt_dict['prompt'] + self.media_token + if 'image' in prompt_dict: + prompt_dict['image_path'] = prompt_dict['image'] + image_path = os.path.join(self.media_base_path, prompt_dict['image']) + if cur_media_feature is None: + cur_media_feature = ("image", self.image_processor(image_path)) + if 'video' in prompt_dict: + prompt_dict['video_path'] = prompt_dict['video'] + video_path = os.path.join(self.media_base_path, prompt_dict['video']) + if cur_media_feature is None: + cur_media_feature = ("video", self.video_processor(video_path)) + cur_item.append(prompt_dict) + return cur_media_feature, cur_item + - def __getitem__(self, idx): - return self.sentences[idx] +def collate_function(batch): + # do nothing + return batch + + +def do_inference(dataloader, model, length_params, sampling_params, cfg): + responses = [] + all_prompts = [] + for idx, batch_media_prompts in enumerate(dataloader): + if idx % 10 == 0: + print(f"Processed {idx} batch media") + for media_media_feature, prompts in batch_media_prompts: + media, media_feature = media_media_feature + all_prompts.extend(prompts.copy()) + for prompt in prompts: + prompt[media] = media_feature + cur_batch_responses = model.generate( + input_prompts=prompts, + length_params=length_params, + 
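TemporalNevaDataset exists so that a media file is decoded and encoded only once even when several prompts refer to it: group_by_media buckets the prompt dicts by the file name stored under the key derived from the media token, and __getitem__ hands back one media feature plus all prompts for it. A small usage sketch with dummy prompts and an identity stand-in for the video processor (the paths and questions below are made up):

prompts = [
    {"video": "a.mp4", "question": "What happens first?"},
    {"video": "a.mp4", "question": "How long does it take?"},
    {"video": "b.mp4", "question": "Who appears?"},
]
dataset = TemporalNevaDataset(
    prompt_dicts=prompts,
    media_base_path="/data/videos",      # assumed path
    media_token="<video>",
    insert_media_token="left",
    video_processor=lambda path: path,   # identity stand-in for the real processor
)
print(len(dataset))                      # 2 groups: a.mp4 and b.mp4
feature, items = dataset[0]
print(feature)                           # ("video", "/data/videos/a.mp4")
print([d["prompt"] for d in items])      # both prompts now start with "<video>"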
sampling_params=sampling_params, + inference_config=cfg, + ) + responses.extend(cur_batch_responses) + return responses, all_prompts @hydra_runner(config_path="conf", config_name="neva_inference") def main(cfg) -> None: model, image_processor, video_processor = create_neva_model_and_processor(cfg) - length_params: LengthParam = { "max_length": cfg.inference.tokens_to_generate, "min_length": cfg.inference.min_tokens_to_generate, @@ -71,35 +156,43 @@ def main(cfg) -> None: "end_strings": cfg.inference.end_strings, } - with open(cfg.prompt_file, 'r') as f: - lines = f.readlines() + prompt_dicts = [] + if cfg.prompt_file.endswith('.json'): + with open(cfg.prompt_file, 'r') as f: + prompt_dicts = json.load(f) + elif cfg.prompt_file.endswith('.jsonl'): + with open(cfg.prompt_file, 'r') as f: + lines = f.readlines() + for line in lines: + prompt_dicts.append(json.loads(line)) + else: + raise ValueError(f"Unsupported prompt file format: {cfg.prompt_file}") media_type_token = cfg.inference.get("media_type", "image") media_token = f"<{media_type_token}>" insert_media_token = cfg.inference.get("insert_media_token", None) - final_prompts = [] - for line in lines: - prompt_dict = json.loads(line) - assert 'prompt' in prompt_dict or 'text' in prompt_dict - if 'prompt' not in prompt_dict: - prompt_dict['prompt'] = prompt_dict['text'] - if insert_media_token == 'left': - prompt_dict['prompt'] = media_token + prompt_dict['prompt'] - elif insert_media_token == 'right': - prompt_dict['prompt'] = prompt_dict['prompt'] + media_token - if 'image' in prompt_dict: - prompt_dict['image_path'] = prompt_dict['image'] - prompt_dict['image'] = image_processor(os.path.join(cfg.inference.media_base_path, prompt_dict['image'])) - if 'video' in prompt_dict: - prompt_dict['video_path'] = prompt_dict['video'] - prompt_dict['video'] = video_processor(os.path.join(cfg.inference.media_base_path, prompt_dict['video'])) - final_prompts.append(prompt_dict) - - responses = model.generate( - input_prompts=final_prompts, length_params=length_params, sampling_params=sampling_params, inference_config=cfg + dataset = TemporalNevaDataset( + prompt_dicts, + cfg.inference.media_base_path, + media_token, + insert_media_token, + image_processor, + video_processor, + cfg.get("add_media_sep", False), ) + num_workers = 2 + dataloader = DataLoader( + dataset, + batch_size=cfg.inference.get("batch_size", 1), + shuffle=False, + collate_fn=collate_function, + num_workers=num_workers, + persistent_workers=True, + ) + responses, final_prompts = do_inference(dataloader, model, length_params, sampling_params, cfg) + # =================== Start Quantization ==================== if HAVE_MODELOPT and cfg.quantization.enable == True: print(f"Using quantization algorithm: {cfg.quantization.algorithm}") @@ -113,21 +206,33 @@ def main(cfg) -> None: raise ValueError(f"Unsupported quantization algorithm: {cfg.quantization.algorithm}") def forward_loop(): - model.generate( - input_prompts=final_prompts, - length_params=length_params, - sampling_params=sampling_params, - inference_config=cfg, + num_samples = cfg.quantization.get("num_samples", 100) + if num_samples == -1: + cur_prompt_dicts = prompt_dicts + else: + cur_prompt_dicts = prompt_dicts[:num_samples] + cur_dataset = TemporalNevaDataset( + cur_prompt_dicts, + cfg.inference.media_base_path, + media_token, + insert_media_token, + image_processor, + video_processor, + cfg.get("add_media_sep", False), ) + cur_dataloader = DataLoader( + cur_dataset, + batch_size=cfg.inference.get("batch_size", 1), + 
shuffle=False, + collate_fn=collate_function, + num_workers=num_workers, + ) + _, _ = do_inference(cur_dataloader, model, length_params, sampling_params, cfg) mtq.quantize(model, mtq_config, forward_loop) - responses = model.generate( - input_prompts=final_prompts, - length_params=length_params, - sampling_params=sampling_params, - inference_config=cfg, - ) + responses, final_prompts = do_inference(dataloader, model, length_params, sampling_params, cfg) + # ============== Quantization End ========================= # PP middle stages do not yield any responses @@ -138,7 +243,7 @@ def forward_loop(): results = [] for response, prompt in zip(responses, final_prompts): prompt['full_text'] = response["clean_text"] - prompt['text'] = response["clean_response"] + prompt['pred_answer'] = response["clean_response"] prompt['model_id'] = cfg.neva_model_file if 'image_path' in prompt: prompt['image'] = prompt.pop('image_path') @@ -151,8 +256,11 @@ def forward_loop(): results.append(prompt) with open(cfg.output_file, 'w') as f: - for result in results: - f.write(json.dumps(result) + '\n') + if cfg.output_file.endswith('.json'): + json.dump(results, f, indent=2) + else: + for result in results: + f.write(json.dumps(result) + '\n') if __name__ == '__main__': diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 10a6c9e7283d..2e110eebe9e6 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -34,6 +34,10 @@ DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" +DEFAULT_VID_START_TOKEN = "" +DEFAULT_VID_END_TOKEN = "" +TIME_TOKEN_TEMPLATE = "" + class SeparatorStyle(Enum): """Different separator style.""" diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 7eef677e13a8..b56c42fff274 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -34,11 +34,15 @@ import nemo.collections.multimodal.data.neva.conversation as conversation_lib from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform from nemo.collections.multimodal.data.neva.conversation import ( + DEFAULT_BOS_TOKEN, + DEFAULT_EOS_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, + DEFAULT_VID_END_TOKEN, + DEFAULT_VID_START_TOKEN, DEFAULT_VIDEO_TOKEN, ) from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids @@ -145,7 +149,7 @@ def open_video(self, file_name): cap = decord.VideoReader(f) return self.flatten_frames(cap) else: - decord.bridge.set_bridge("torch") + # decord.bridge.set_bridge("torch") cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) return self.flatten_frames(cap) return None @@ -171,9 +175,7 @@ def flatten_frames(self, cap): else: num_frames = min(len(cap), self.data_cfg['num_frames']) indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) - frames = [] - frames = cap.get_batch(indices) - + frames = [Image.fromarray(cap[i].asnumpy()).convert('RGB') for i in indices] while len(frames) < self.data_cfg['num_frames']: frames.append(frames[-1]) return frames @@ -226,6 +228,25 @@ def tokenize( return result +def get_tokens_ids(tokenizer, tokens): + """ + Returns the token id for a given token. 
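The frame-loading change in neva_dataset.py returns a plain list of RGB PIL images instead of decord's batched tensor: indices are sampled uniformly across the clip and, when the clip has fewer frames than requested, the last frame is repeated. The index selection and padding in isolation (clip length and requested frame count are toy values):

import numpy as np

clip_len, requested = 5, 8                     # a clip shorter than the requested frame count
n = min(clip_len, requested)                   # 5
indices = np.linspace(0, clip_len - 1, n, dtype=int)
print(indices.tolist())                        # [0, 1, 2, 3, 4]

frames = [f"frame_{i}" for i in indices]       # stand-ins for the PIL images
while len(frames) < requested:                 # pad to the requested length with the last frame
    frames.append(frames[-1])
print(frames)                                  # ['frame_0', ..., 'frame_4', 'frame_4', 'frame_4', 'frame_4']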
+ + Parameters + ---------- + tokenizer : nemo tokenizer + A tokenizer to be used for tokenization. + tokens : list + A list of tokens to get the token id for. + + Returns + ------- + List + The token ids. + """ + return [tokenizer.token_to_id(token) for token in tokens] + + def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: int, use_plain: bool = False) -> Dict: """ Preprocesses multimodal sources based on the provided configuration. @@ -259,13 +280,15 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in if not is_multimodal: return sources - num_patches = image_token_len + num_frames = multimodal_cfg['num_frames'] + # vila + if multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + image_token_len //= 4 + num_patches = image_token_len + # TO DO: to support multiple images if media_type == 'video': - num_patches *= multimodal_cfg['num_frames'] - - if multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': - num_patches //= 4 + num_patches *= num_frames if multimodal_cfg['use_im_start_end']: replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * num_patches @@ -273,6 +296,44 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * (num_patches - 2) replace_token = DEFAULT_IM_START_TOKEN[model_type] + replace_token + DEFAULT_IM_END_TOKEN[model_type] + if media_type == 'video' and multimodal_cfg.get("use_lita", False): + if not multimodal_cfg.get('lita', None): + raise ValueError("LITA config is missing") + lita_video_arch = multimodal_cfg['lita']['lita_video_arch'] + num_temporal_tokens, num_spatial_tokens = num_frames, 0 + if lita_video_arch == 'temporal_all_resolution': + sample_frames = min(multimodal_cfg['lita']['sample_frames'], num_frames) + # num_frames for temporal tokens, sample_frames * num_patches for spatial tokens + num_spatial_tokens = sample_frames * image_token_len + else: + # num_frames for temporal tokens and num_patches for spatial tokens + num_spatial_tokens = image_token_len + num_tokens = num_temporal_tokens + num_spatial_tokens + + visual_token_format = multimodal_cfg['lita'].get('visual_token_format', 'v1') + media_start = DEFAULT_IM_START_TOKEN[model_type] + media_end = DEFAULT_IM_END_TOKEN[model_type] + image_patch = DEFAULT_IMAGE_PATCH_TOKEN[model_type] + if visual_token_format == 'im_vid_start_end': + image_start, image_end = DEFAULT_IM_START_TOKEN[model_type], DEFAULT_IM_END_TOKEN[model_type] + vid_start, vid_end = DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN + if multimodal_cfg['use_im_start_end']: + replace_token_list = [image_start + image_patch * image_token_len + image_end] * sample_frames + replace_token_list += [vid_start + image_patch * num_temporal_tokens + vid_end] + replace_token = "".join(replace_token_list) + else: + replace_token_list = [image_start + image_patch * (image_token_len - 1) + image_end] + replace_token_list += [image_start + image_patch * image_token_len + image_end] * (sample_frames - 1) + replace_token_list += [vid_start + image_patch * (num_temporal_tokens - 1) + vid_end] + replace_token = "".join(replace_token_list) + replace_token = media_start + replace_token + media_end + else: + if multimodal_cfg['use_im_start_end']: + replace_token = image_patch * num_tokens + else: + replace_token = image_patch * (num_tokens - 2) + replace_token = media_start + replace_token + media_end + for source in sources: conversation = source['conversations'] if multimodal_cfg['sep_image_conv_front']: 
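With LITA enabled, the media token is expanded into a fixed number of placeholder patch tokens, and the counting above is easiest to see with concrete numbers. The sizes below are illustrative only; the real values come from the model and data config:

# 'temporal_all_resolution' with the 'mlp_downsample' adapter.
image_token_len = 1024 // 4          # spatial tokens per frame after the downsample division
num_frames = 100                     # temporal (time) tokens
sample_frames = 4
num_spatial_tokens = sample_frames * image_token_len     # 1024
num_tokens = num_frames + num_spatial_tokens             # 1124

# With visual_token_format='im_vid_start_end' and use_im_start_end=True the
# replacement string is: sample_frames blocks of
#   DEFAULT_IM_START_TOKEN + image_token_len patch tokens + DEFAULT_IM_END_TOKEN,
# then one block of
#   DEFAULT_VID_START_TOKEN + num_frames patch tokens + DEFAULT_VID_END_TOKEN,
# all wrapped in one outer DEFAULT_IM_START_TOKEN / DEFAULT_IM_END_TOKEN pair.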
@@ -290,7 +351,6 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in conversation[0]['value'] = default_token for turn in conversation: turn["value"] = turn["value"].replace(default_token, replace_token) - return sources @@ -475,9 +535,13 @@ def preprocess_llama_2( ) # llama tricks - tokens[tokens == 32003] = 0 # DEFAULT_IMAGE_PATCH_TOKEN - tokens[tokens == 32006] = 1 # - tokens[tokens == 32007] = 2 # + # 32003, 32006, 32007 + image_patch_token = DEFAULT_IMAGE_PATCH_TOKEN["llama_2"] + DEFAULT_TOKENS = [image_patch_token, DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN] + img_patch_id, bos_id, eos_id = get_tokens_ids(tokenizer, DEFAULT_TOKENS) + tokens[tokens == img_patch_id] = 0 # DEFAULT_IMAGE_PATCH_TOKEN + tokens[tokens == bos_id] = 1 # + tokens[tokens == eos_id] = 2 # labels = tokens.clone().detach() # Mask labels @@ -577,9 +641,14 @@ def preprocess_v1( ) # llama tricks - tokens[tokens == 32003] = 0 # DEFAULT_IMAGE_PATCH_TOKEN - tokens[tokens == 32006] = 1 # - tokens[tokens == 32007] = 2 # + # 32003, 32006, 32007 + image_patch_token = DEFAULT_IMAGE_PATCH_TOKEN["llama_2"] + DEFAULT_TOKENS = [image_patch_token, DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN] + img_patch_id, bos_id, eos_id = get_tokens_ids(tokenizer, DEFAULT_TOKENS) + tokens[tokens == img_patch_id] = 0 # DEFAULT_IMAGE_PATCH_TOKEN + tokens[tokens == bos_id] = 1 # + tokens[tokens == eos_id] = 2 # + # tokens = torch.concat((torch.tensor([[1]]), tokens), axis=1) #lita 1.5 legacy labels = tokens.clone().detach() # Mask labels @@ -977,7 +1046,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]: frames = self.video_loader.open_video(video_file) if frames is None: logging.warning(f"Video {video_file} could not be found!") - if isinstance(self.processor, CLIPImageProcessor): + if isinstance(self.processor, CLIPImageProcessor) or isinstance(self.processor, SiglipImageProcessor): # image processor from HF if self.multimodal_cfg['image_aspect_ratio'] == 'keep': max_hw, min_hw = max(frames.size), min(frames.size) @@ -1268,6 +1337,8 @@ def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict: context_length=model_cfg.encoder_seq_length, media_type=data_cfg.get('media_type', 'image'), num_frames=data_cfg.get('num_frames', -1), + use_lita=getattr(model_cfg.mm_cfg, 'use_lita', False), + lita=getattr(model_cfg.mm_cfg, 'lita', {}), mm_mlp_adapter_type=model_cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), ), data_cfg=dict( diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 376237e89ecc..92f13c28c287 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -17,9 +17,10 @@ from itertools import chain from typing import Any, Optional +import numpy as np import torch import torch.nn.functional as F -from einops import rearrange, repeat +from einops import rearrange, reduce, repeat from omegaconf.dictconfig import DictConfig from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer @@ -137,6 +138,7 @@ def init_vision( media_start_id, media_end_id, vision_select_layer=-1, + vision_select_feature="patch", class_token_length=1, use_im_start_end=False, ): @@ -147,6 +149,7 @@ def init_vision( self.class_token_length = class_token_length self.use_im_start_end = use_im_start_end self.vision_select_layer = vision_select_layer + self.vision_select_feature = vision_select_feature 
self.media = None self.set_accepted_adapter_types([MultimodalProjectorAdapterConfig._target_]) @@ -208,7 +211,10 @@ def encode_vision_x(self, vision_x: torch.Tensor): self.vision_encoder.backbone.transformer.return_select_layer = self.vision_select_layer vision_x = self.vision_encoder(vision_x) vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F) - vision_x = vision_x[:, :, :, self.class_token_length :] + if self.vision_select_feature == "patch": + vision_x = vision_x[:, :, :, self.class_token_length :] + elif self.vision_select_feature != "cls_patch": + raise ValueError(f"Unsupported vision_select_feature {self.vision_select_feature}") assert self.is_adapter_available(), "Cannot find multimodal vision adapter!" vision_connector = self.get_adapter_module(AdapterName.MULTIMODAL_PROJECTOR_ADAPTER) vision_x = vision_connector(vision_x) @@ -273,6 +279,147 @@ def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), **kw return sharded_state_dict +class LitaWordEmbeddingMixin(NevaWordEmbeddingMixin): + def init_lita( + self, + lita_video_arch: str, + visual_token_format: str = "v1", + use_media_start_end: bool = False, + sample_frames: int = 4, + ): + """_summary_ + + Args: + lita_video_arch (str): ['temporal_spatial_pool', 'temporal_spatial', 'temporal_all_resolution'] + visual_token_format (str, optional): default to 'v1', other option ["v1", "im_vid_start_end"] + v1: no video_start_id and video_end_id, video tokens are inserted between fast/slow (temporal/spatial) tokens + im_vid_start_end: video start and end tokens are inserted before and after temporal tokens + image start and end tokens are inserted before and after spatial tokens + use_media_start_end (bool, optional): + whether media start and media end is used in input_ids, Defaults to False. + Notice, when it is false, the media_start_id and media_end_id will play as an placeholder + input_ids = [..., media_start_id, t1, t2, t3...., media_end_id, ...] + use_media_start_end = False + we will replace the tokens including and between: [media_start_id, ... media_end_id] + use_media_start_end = True + we will replace the tokens between: (media_start_id, ... media_end_id) + num_frames (int, optional): number of frames to sample from the video, default to 4 + """ + self.lita_video_arch = lita_video_arch + self.visual_token_format = visual_token_format + self.use_media_start_end = use_media_start_end + self.sample_frames = sample_frames + + def add_lita_layer(self, media_features): + """_summary_ + + Args: + media_features (torch.Tensor): + feature after encoded by vision encoder + shape: Batch, T (number of images), S (num patches), H (hidden size) + Returns: + tokens (torch.Tensor): + shape: Batch, T + M, D (hidden size) + """ + + b, T, S, H = media_features.shape + tokens = media_features + if self.lita_video_arch == 'temporal_spatial_pool': + pool_size = 2 + h = w = int(np.sqrt(S)) + selected_frames = np.round(np.linspace(0, tokens.shape[1] - 1, pool_size * pool_size)).astype(int) + s_tokens = tokens[:, selected_frames, ...] 
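The 'temporal_spatial_pool' branch that starts above (and continues just below) reduces B x T x S x H features to T mean-pooled temporal tokens plus a small grid of average-pooled spatial tokens. A standalone shape check with toy dimensions, using the same einops operations:

import numpy as np
import torch
import torch.nn.functional as F
from einops import rearrange, reduce

# Toy dimensions: batch 1, T=8 frames, S=16 patches (4x4 grid), hidden 32.
b, T, S, H = 1, 8, 16, 32
tokens = torch.randn(b, T, S, H)

pool_size = 2
h = w = int(np.sqrt(S))                                             # 4
selected = np.round(np.linspace(0, T - 1, pool_size * pool_size)).astype(int)
s_tokens = tokens[:, selected, ...]                                 # (1, 4, 16, 32)
s_tokens = rearrange(s_tokens, 'b t (h w) d -> (b t) d h w', h=h, w=w)
s_tokens = F.avg_pool2d(s_tokens, kernel_size=pool_size)            # (4, 32, 2, 2)
s_tokens = rearrange(s_tokens, '(b t) d h w -> b (t h w) d', b=b)   # (1, 16, 32) spatial tokens
t_tokens = reduce(tokens, 'b t s d -> b t d', 'mean')               # (1, 8, 32) temporal tokens
print(t_tokens.shape, s_tokens.shape)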
+ s_tokens = rearrange(s_tokens, 'b t (h w) d -> (b t) d h w', h=h, w=w) + s_tokens = F.avg_pool2d(s_tokens, kernel_size=pool_size) + s_tokens = rearrange(s_tokens, '(b t) d h w -> b (t h w) d', b=b) # B, M, D + t_tokens = reduce(tokens, 'b t s d -> b t d', 'mean') + # tokens = torch.cat([t_tokens, s_tokens], dim=1) # B, T + M, D + return t_tokens, s_tokens + elif self.lita_video_arch == 'temporal_spatial': + t_tokens = reduce(tokens, 'b t s d -> b t d', 'mean') + s_tokens = reduce(tokens, 'b t s d -> b s d', 'mean') + # tokens = torch.cat([t_tokens, s_tokens], dim=1) # B, T + M, D + return t_tokens, s_tokens + elif self.lita_video_arch == 'temporal_all_resolution': + idx = np.round(np.linspace(0, tokens.shape[1] - 1, self.sample_frames)).astype(int) + im_features = tokens[:, idx, ...] # B, num_frames, S, D + # im_tokens = im_features.view(b, -1, H) # flatten the B, num_frames * S, D + im_tokens = im_features + vid_tokens = reduce(tokens, 'b t s d -> b t d', 'mean') + # s and t tokens have been changed position + return im_tokens, vid_tokens + else: + raise ValueError(f"Unknown video architecture: {self.lita_video_arch}") + + def replace_media_embeddings(self, input_ids, inputs_embeds, media): + """_summary_ + + Args: + input_ids (torch.tensor): The input token ids [B, T] + words_embeddings (torch.tensor): The input embeddings [B, T, D] + media (torch.Tensor): Vision input + shape (B, T_img, F, C, H, W) + """ + if input_ids.shape[1] == 1: + return inputs_embeds + + if media is None: + return inputs_embeds + if type(media) is list: + raise NotImplementedError("dynamic length of videos not supported yet, only fixed length of videos now") + # 1, 1, num_frames, 3, 244, 244 + media_features = self.encode_vision_x(media) # B T F S(eq) H(idden) + B, T, F, S, H = media_features.shape + assert T == 1, "multiple videos per sample not supported yet" + media_features = media_features.squeeze(1) + t_tokens, s_tokens = self.add_lita_layer(media_features) # B, T, D & B, M, D + T = t_tokens.shape[1] + M = s_tokens.shape[1] + inputs_embeds = inputs_embeds.clone() + for idx, input_id in enumerate(input_ids): + media_start_position = torch.where(input_id == self.media_start_id)[0] + media_end_position = torch.where(input_id == self.media_end_id)[0] + if self.visual_token_format != 'im_vid_start_end': + assert len(media_start_position) == 1, "Only 1 video per sample supported" + assert len(media_end_position) == 1, "Only 1 video per sample supported" + + media_start_position = media_start_position[0] + media_end_position = media_end_position[-1] + if self.use_media_start_end: + # replace the tokens between media_start_id and media_end_id + start, end = media_start_position + 1, media_end_position - 1 + else: + # replace the tokens including and between media_start_id and media_end_id + start, end = media_start_position, media_end_position + + if self.visual_token_format == 'v1': + t_token_start, t_token_end = start, start + T + s_token_start, s_token_end = start + T, start + T + M + assert s_token_end == end + 1, "Token replacement error" + inputs_embeds[idx, t_token_start:t_token_end] = temporal_tokens[idx] + inputs_embeds[idx, s_token_start:s_token_end] = spatial_tokens[idx] + elif self.visual_token_format == 'im_vid_start_end': # v1.5 lita + if not self.use_media_start_end: + # replace the media start and media end embedding with + # img_start and vid_end token embedding + inputs_embeds[idx, start] = inputs_embeds[idx, start + 1] + inputs_embeds[idx, end] = inputs_embeds[idx, end - 1] + # TO DO: To 
optimize the below codes + im_features, vid_features = t_tokens[idx], s_tokens[idx] + # im_feature: num_frames * S, D + emb_start = start + 1 # skip the img_start token + num_frames, S, D = im_features.shape + for i in range(num_frames): + inputs_embeds[idx, emb_start : emb_start + S] = im_features[i] + emb_start = emb_start + S + 2 # skip the img_end token and img_start token + T = vid_features.shape[0] + inputs_embeds[idx, emb_start : emb_start + T] = vid_features + assert emb_start + T == end + else: + raise ValueError(f"Unsupported visual_token_format {self.visual_token_format}") + return inputs_embeds + + class NevaBaseModel: """ Base class for a multimedia model integrating vision and language models. @@ -307,12 +454,24 @@ def __init__( # Monkey patch embedding if kwargs.get("pre_process", True): - extend_instance(self.embedding.word_embeddings, NevaWordEmbeddingMixin) + if not mm_cfg.get("use_lita", False): + extend_instance(self.embedding.word_embeddings, NevaWordEmbeddingMixin) + else: + extend_instance(self.embedding.word_embeddings, LitaWordEmbeddingMixin) + lita_conf = mm_cfg.get('lita', {}) + self.embedding.word_embeddings.init_lita( + lita_video_arch=lita_conf.get('lita_video_arch', 'temporal_spatial_pool'), + visual_token_format=lita_conf.get('visual_token_format', 'v1'), + use_media_start_end=mm_cfg.get('use_im_start_end', False), # we need to make this clear + sample_frames=lita_conf.get('sample_frames', 4), + ) + self.embedding.word_embeddings.init_vision( vision_encoder, media_start_id, media_end_id, vision_select_layer=mm_cfg.vision_encoder.get("vision_select_layer", -2), + vision_select_feature=mm_cfg.vision_encoder.get("vision_select_feature", "patch"), class_token_length=mm_cfg.vision_encoder.get("class_token_length", 1), use_im_start_end=mm_cfg.get("use_im_start_end", False), ) @@ -320,7 +479,11 @@ def __init__( def create_vision_encoder_and_processor(self, mm_cfg): # Initialize vision encoder and freeze it if mm_cfg.vision_encoder.get("from_hf", False): - if "clip" in mm_cfg.vision_encoder.from_pretrained: + if ( + "clip" in mm_cfg.vision_encoder.from_pretrained + or "vit" in mm_cfg.vision_encoder.from_pretrained + or "clip" in mm_cfg.vision_encoder.get("model_type", "") + ): vision_encoder = CLIPVisionModel.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, @@ -330,7 +493,9 @@ def create_vision_encoder_and_processor(self, mm_cfg): for param in vision_encoder.parameters(): param.requires_grad = False vision_encoder = vision_encoder.eval() - elif "siglip" in mm_cfg.vision_encoder.from_pretrained: + elif "siglip" in mm_cfg.vision_encoder.from_pretrained or "siglip" in mm_cfg.vision_encoder.get( + "model_type", "" + ): vision_encoder = SiglipVisionModel.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 5a01e8702a9e..75804b8acd00 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -466,7 +466,6 @@ def image_processor(maybe_image_path): def video_processor(maybe_video_path): if isinstance(maybe_video_path, str): - decord.bridge.set_bridge("torch") vr = decord.VideoReader(maybe_video_path) if neva_cfg.data.splice_single_frame == 'first': frames = [Image.fromarray(vr[0].asnumpy()).convert('RGB')] @@ -480,19 +479,23 @@ def video_processor(maybe_video_path): else: num_frames = min(len(vr), neva_cfg.data.num_frames) indices = np.linspace(0, len(vr) 
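The emb_start arithmetic in the 'im_vid_start_end' branch above walks frame by frame through the placeholder layout, skipping the per-frame start and end tokens. With toy sizes the bookkeeping looks like this; it is a sketch of the index math only, for the use_media_start_end=True layout, not of the real embeddings:

F, S, T = 2, 3, 4   # sample_frames, spatial features per frame, temporal features
# Placeholder layout between the outer media start/end markers:
layout = (["im_start"] + ["patch"] * S + ["im_end"]) * F \
         + ["vid_start"] + ["patch"] * T + ["vid_end"]
start, end = 0, len(layout) - 1   # start is the first im_start, end is vid_end

emb_start = start + 1             # skip the first im_start
for _ in range(F):
    # this frame's S spatial features fill [emb_start, emb_start + S)
    emb_start += S + 2            # skip this frame's im_end and the next im_start / vid_start
# the T temporal (video) features fill [emb_start, emb_start + T)
assert emb_start + T == end       # the same invariant the code above asserts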
- 1, num_frames, dtype=int) - frames = vr.get_batch(indices) - + frames = [Image.fromarray(vr[i].asnumpy()).convert('RGB') for i in indices] while len(frames) < neva_cfg.data.num_frames: frames.append(frames[-1]) else: frames = maybe_video_path - if neva_cfg.mm_cfg.vision_encoder.from_hf: - processor = CLIPImageProcessor.from_pretrained( - neva_cfg.mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 - ) + if neva_cfg.mm_cfg.vision_encoder.get("from_hf", False): + if ( + "siglip" in neva_cfg.mm_cfg.vision_encoder.from_pretrained + or "siglip" in neva_cfg.mm_cfg.vision_encoder.get("model_type", "") + ): + processor = SiglipImageProcessor.from_pretrained(neva_cfg.mm_cfg.vision_encoder.from_pretrained) + else: + # for clip and vit model + processor = CLIPImageProcessor.from_pretrained(neva_cfg.mm_cfg.vision_encoder.from_pretrained) else: - processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16) + processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") # support single video inference if neva_cfg.data.image_aspect_ratio == 'keep': @@ -518,7 +521,7 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result - frames = [expand2square(frame, tuple(int(x * 255) for x in self.processor.image_mean)) for frame in frames] + frames = [expand2square(frame, tuple(int(x * 255) for x in processor.image_mean)) for frame in frames] frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] else: frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] @@ -531,11 +534,17 @@ def expand2square(pil_img, background_color): def create_image_processor(mm_cfg): if mm_cfg.vision_encoder.get("from_hf", False): - if "clip" in mm_cfg.vision_encoder.from_pretrained: + if ( + "clip" in mm_cfg.vision_encoder.from_pretrained + or "vit" in mm_cfg.vision_encoder.from_pretrained + or "clip" in mm_cfg.vision_encoder.get("model_type", "") + ): image_processor = CLIPImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) - elif "siglip" in mm_cfg.vision_encoder.from_pretrained: + elif "siglip" in mm_cfg.vision_encoder.from_pretrained or "siglip" in mm_cfg.vision_encoder.get( + "model_type", "" + ): image_processor = SiglipImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 8f8fe313a5e3..3b57b3988310 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -584,6 +584,7 @@ def __init__(self, model): media_type=getattr(self.data_cfg, 'media_type', 'image'), num_frames=getattr(self.data_cfg, 'num_frames', 1), mm_mlp_adapter_type=getattr(self.cfg.mm_cfg, 'mm_mlp_adapter_type', 'linear'), + use_lita=getattr(self.cfg.mm_cfg, 'use_lita', False), ) if self.multimodal_cfg['crop_size'] is None: image_processor = CLIPImageProcessor.from_pretrained( @@ -605,6 +606,21 @@ def __init__(self, model): width_num_patches += 1 self.num_media_latents = height_num_patches * width_num_patches + # add config for lita + if self.multimodal_cfg['use_lita']: + if self.cfg.mm_cfg.get('lita'): + lita = { + 'lita_video_arch': getattr(self.cfg.mm_cfg.lita, 'lita_video_arch', 'temporal_spatial_pool'), + 'visual_token_format': getattr(self.cfg.mm_cfg.lita, 
'visual_token_format', 'v1'), + 'sample_frames': getattr(self.cfg.mm_cfg.lita, 'sample_frames', 1), + } + self.multimodal_cfg['lita'] = lita + else: + self.multimodal_cfg['use_lita'] = False + raise Warning( + 'Use lita has been set True but Lita config not found in the config file' + 'LITA will be disabled for this run.' + ) def clip_max_len(self, maxlen: int) -> int: """clip the max len based on the LM model max sequence length""" @@ -687,6 +703,7 @@ def prepare_batch_at_step( # not using type2use. uncomment it if it is used # if type_ids is not None: # types2use = type_ids[:, context_length - 1].view(batch_size, -1) + media = None """Prepare batch for each of the inference steps""" attention_mask_repeat = None diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index cd02f5409679..1bd5b618de35 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -31,6 +31,8 @@ DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, + DEFAULT_VID_END_TOKEN, + DEFAULT_VID_START_TOKEN, ) from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.modules.common.text_generation_strategy import model_inference_strategy_dispatcher @@ -144,7 +146,75 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para return output +def decode_time_tokens(tokenizer, text: str, duration: float, time_tokens: list[str], time_token_ids: list[int]): + """Decode the time tokens .... in the text to the actual time in seconds. + TO DO: to do time decoding on output ids instead of text + + Args: + text (str): _description_ + duration (float): the total length of the video in seconds + time_tokens (list[str]): list of time tokens [, , , ..] + time_token_ids (list[str]): list of time token ids [32004, 32005, ....] 
+ """ + output_ids = tokenizer.text_to_ids(text) + num_time_tokens = len(time_token_ids) + # the original code is len(output_ids) - 1 + indices = [j for j in range(len(output_ids)) if output_ids[j] in time_token_ids] + last_processed = -1 + new_output_ids = [] + for j in range(len(indices)): + pred_seq = [int(output_ids[k]) for k in range(last_processed + 1, indices[j])] + new_output_ids.extend(pred_seq) + max_offset = num_time_tokens - 1 + time_token = tokenizer.ids_to_tokens([output_ids[indices[j]]])[0] + time_idx = time_tokens.index(time_token) + time = float(time_idx) * duration / max_offset + time = min(max(time, 0), duration) + time = round(time, 2) + # time_str = '<' + str(time) + '>' + time_str = '<%s>' % str(time) + new_output_ids.extend(tokenizer.text_to_ids(time_str)) + + last_processed = indices[j] + pred_seq = [int(x) for x in output_ids[last_processed + 1 :]] + new_output_ids.extend(pred_seq) + output_ids = new_output_ids + decoded_text = tokenizer.ids_to_text(output_ids) + return decoded_text + + +def encode_time_str(text: str, duration: float, num_time_tokens: int = 100, time_token_template: str = ""): + """ + Encode the common time expression to its time token expression + """ + + def time_to_string(time): + # time is normalized in [0, 1] + max_offset = float(num_time_tokens - 1) + time = int(np.round(max_offset * time)) + return time_token_template.format(t=time) + + def repl(match): + value = float(match.group(1)) / duration + return time_to_string(value) + f"" + + text = re.sub(r"<([\d.]{1,20})s>", repl, text) + text = re.sub(r"\s([\d.]{1,20})s[\s|\.|,|>]", repl, text) + text = re.sub(r"\s([\d.]{1,20}) seconds", repl, text) + text = re.sub(r"\s([\d.]{1,20}) second", repl, text) + + # This is to remove the timestamps from the text + text = re.sub(r"", "", text) + return text.strip() + + def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_params, inference_config, **strategy_args): + use_lita = model.cfg.mm_cfg.get('use_lita', False) + if use_lita: + num_time_tokens = model.cfg.data.get('num_time_tokens', 100) + TIME_TOKEN_TEMPLATE = "" + time_tokens = [TIME_TOKEN_TEMPLATE.format(t=i) for i in range(num_time_tokens)] + time_token_ids = model.tokenizer.tokens_to_ids(time_tokens) model_type = model.cfg.mm_cfg.llm.get("model_type", "nvgpt") conv_template = model.cfg.data.get("conv_template", "nvgpt") @@ -152,6 +222,14 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para for idx, prompt_dict in enumerate(prompt_dict_list): # determine the media type in the prompt_dict media_type_token = inference_config.inference.get("media_type", "image") + if use_lita: + if prompt_dict.get("duration") is not None: + duration = prompt_dict.get("duration") + prompt_dict['prompt'] = encode_time_str( + prompt_dict['prompt'], duration, num_time_tokens, TIME_TOKEN_TEMPLATE + ) + else: + print("duration field is not in prompt file, skipping time encoding.") response = generate( model, inputs=prompt_dict.get('prompt'), @@ -184,7 +262,12 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para r'|', r'\|' ) ) - combined_pattern = re.compile(f'{pattern.pattern}|{pattern_nvgpt.pattern}') + + if use_lita: + pattern_lita = re.compile(rf'{DEFAULT_IM_START_TOKEN[model_type]}(.)+{DEFAULT_IM_END_TOKEN[model_type]}') + combined_pattern = re.compile(f'{pattern_lita.pattern}') + else: + combined_pattern = re.compile(f'{pattern.pattern}|{pattern_nvgpt.pattern}') clean_text = re.sub(combined_pattern, f"<{media_type_token}>", 
response['sentences'][0]) clean_response = clean_text @@ -204,10 +287,18 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para clean_response = clean_response.rsplit("[/INST] ", 1)[-1] elif conv_template == "llama_3": clean_response = clean_response.rsplit("assistant<|end_header_id|>\n\n", 1)[-1] - clean_response = clean_response.rstrip("<|eot_id|>") + clean_response = re.sub(r"(<\|eot_id\|>)+$", "", clean_response) elif conv_template == "v1": clean_response = clean_response.rsplit("ASSISTANT: ", 1)[-1] + if use_lita: + if prompt_dict.get("duration", None) is not None: + duration = prompt_dict.get("duration") + clean_response = decode_time_tokens( + model.tokenizer, clean_response, duration, time_tokens, time_token_ids + ) + else: + print("duration field is not in prompt file, skipping time decoding.") clean_response = clean_response.strip() response["clean_text"] = clean_text response["clean_response"] = clean_response diff --git a/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_evaluation.py b/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_evaluation.py new file mode 100644 index 000000000000..1427e0983b24 --- /dev/null +++ b/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_evaluation.py @@ -0,0 +1,160 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +This script is used to convert the DVC dataset to the format required by the model evaluation for RTL task. +The DVC dataset should have the below structure: +{ + "-4RXOT_UfpM_3": { # video_name is the unique video file name, extention is .mp4 + "duration": 118.01801801801803, + "timestamps": [ + [5, 58], + [66, 82], + [82, 96] + ], + "sentences": [ + "Apply eyeshadow on the lower area then crease with brush", + "Apply eyeshadow on the outer corner of eyes with brush", + "Apply eyeshadow on the outer half of eyes with brush", + ] + }, + ... +} + +The converted format will be as follows: +[ + { + "video": "-4RXOT_UfpM_3.mp4", + "question_id": "-4RXOT_UfpM_3_0", + "question": "When does \"Apply eyeshadow on the lower area then crease with brush\" happen in the video? Provide a response using only start and end timestamps.", + "ref_answer": "<5> <58> Apply eyeshadow on the lower area then crease with brush", + "duration": 118.01801801801803 + }, + { + "video": "-4RXOT_UfpM_3.mp4", + "question_id": "-4RXOT_UfpM_3_1", + "question": "When is \"Apply eyeshadow on the outer corner of eyes with brush\" depicted in the video? Convey your answer using start and end timestamps exclusively.", + "ref_answer": "<66> <82> Apply eyeshadow on the outer corner of eyes with brush", + "duration": 118.01801801801803 + }, + { + "video": "-4RXOT_UfpM_3.mp4", + "question_id": "-4RXOT_UfpM_3_2", + "question": "When does \"Apply eyeshadow on the outer half of eyes with brush\" happen in the video? 
Provide a response using only start and end timestamps.", + "ref_answer": "<82> <96> Apply eyeshadow on the outer half of eyes with brush", + "duration": 118.01801801801803 + }, + ..... +] + +For each sentence in the sentences list, we will generate one question for it and the answer will be the sentence itself with the timestamps. +USAGE: +python convert_dvc_dataset_for_evaluation.py --input --output_file --ratio + +""" + +import argparse +import json +import os +import random + + +class RTLConverter: + def __init__(self, input_file, output_file, sample_ratio, ext): + self.input_file = input_file + self.output_file = output_file + self.sample_ratio = sample_ratio + self.desc_prompts = [ + "When does \"%s\" happen in the video?", + "At what point in the video does \"%s\" happen?", + "When is \"%s\" depicted in the video?", + "At what time in the video does \"%s\" take place?", + ] + self.time_prompts = [ + "Answer the question only using start and end timestamps.", + "Provide a response using only start and end timestamps.", + "Convey your answer using start and end timestamps exclusively.", + ] + self.ext = ext + + def convert(self): + converted_data = [] + + # Load JSON data + with open(self.input_file, 'r') as file: + data = json.load(file) + + # Fix random seed for reproducibility + random.seed(42) + + # Randomly sample entries based on the sample ratio + vid_list = list(data.keys()) + sampled_vids = random.sample(vid_list, k=int(len(vid_list) * self.sample_ratio)) + + # Iterate through sampled entries + for vid in sampled_vids: + details = data[vid] + duration = details['duration'] + timestamps = details['timestamps'] + sentences = details['sentences'] + + # Iterate through sentences + for i, sentence in enumerate(sentences): + question_id = f"{vid}_{i}" + desc_prompt = random.choice(self.desc_prompts) + time_prompt = random.choice(self.time_prompts) + start_time, end_time = timestamps[i] + answer = f"<{start_time}> <{end_time}> {sentence}" + + # Construct question + question = (desc_prompt % sentence) + ' ' + time_prompt + + # Create entry in converted data + converted_data.append( + { + "video": vid + self.ext, + "question_id": question_id, + "question": question, + "ref_answer": answer, + "duration": duration, + } + ) + + # Ensure the output directory exists + os.makedirs(os.path.dirname(self.output_file), exist_ok=True) + + # Write converted data to output file + with open(self.output_file, 'w') as file: + json.dump(converted_data, file, indent=2) + + +def main(): + parser = argparse.ArgumentParser(description="Convert makeup QA JSON format") + parser.add_argument("--input", help="Input DVC JSON file", required=True) + parser.add_argument("--output_file", help="Output file", default="rtl_eval.json", required=True) + parser.add_argument("--ratio", help="Sampling ratio between 0 and 1", type=float, default=1.0, required=False) + parser.add_argument("--ext", help="Extension of the video files", default=".mp4", required=False) + args = parser.parse_args() + + if args.ratio < 0 or args.ratio > 1: + raise ValueError("Sampling ratio must be between 0 and 1") + + converter = RTLConverter(args.input, args.output_file, args.ratio, args.ext) + converter.convert() + + +if __name__ == "__main__": + main() diff --git a/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_training.py b/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_training.py new file mode 100644 index 000000000000..a80900e30004 --- /dev/null +++ 
b/scripts/multimodal_dataset_conversion/convert_dvc_dataset_for_training.py @@ -0,0 +1,322 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +This script is used to convert the DVC dataset to the format required by the model training script. +The DVC dataset should have the below structure: +{ + "1043215450": { # video_name is the unique video file name (the extension should be .mp4) + "duration": 125.0, + "timestamps": [ + [0, 5], + [3, 9] + ], + "sentences": [ # For custom caption or event localization task + "Here is your caption 1", + "Here is your caption 2", + ], + "events": [ # For custom event task + "Event 1", + "Event 2", + ] + }, + ... +} + +The converted dataset format is as follows: +[ + # 1st example: dense video captioning (custom event or custom caption task) + { + "id": "xxxx", + "video: "xxxx.mp4", + "conversations": + [ + {"from": "human", "value": "