Merge branch 'main' into aot/qwen-recipe
suiyoubi authored Oct 27, 2024
2 parents a4aafc8 + e13466f · commit 4d893ff
Showing 28 changed files with 1,329 additions and 59 deletions.
14 changes: 13 additions & 1 deletion .github/workflows/_test_template.yml
@@ -33,13 +33,17 @@ on:
log:
description: Last 2000 characters of the test step's log
value: ${{ jobs.main.outputs.log }}
potential_infra_failure:
description: Boolean flag set when an infra-related keyword is spotted in the logs.
value: ${{ jobs.main.outputs.potential_infra_failure }}
jobs:

main:
runs-on: ${{ inputs.RUNNER }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
potential_infra_failure: ${{ steps.main.outputs.potential_infra_failure }}
steps:
- name: Docker system cleanup
run: |
@@ -51,7 +55,12 @@ jobs:
- name: Start container
run: |
-          docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+          ARG=("")
+          if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
+            ARG=("--runtime=nvidia --gpus all")
+          fi
+          docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
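In effect, the container now requests the NVIDIA runtime only on GPU machines: any runner whose label contains `cpu` gets no GPU flags. A minimal Python sketch of that selection (the workflow does this in bash; the helper name is illustrative):

```python
def gpu_docker_args(runner: str) -> list[str]:
    """Mirror the bash check above: CPU runners get no GPU flags."""
    if "cpu" in runner:
        return []
    return ["--runtime=nvidia", "--gpus", "all"]

# The two runner labels used by this pipeline:
assert gpu_docker_args("self-hosted-azure-cpu") == []
assert gpu_docker_args("self-hosted-azure") == ["--runtime=nvidia", "--gpus", "all"]
```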
- id: main
name: Run main script
@@ -70,6 +79,9 @@ jobs:
echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT"
potential_infra_failure=$(cat err.log | grep -Eqi "gpu|cuda|device" && echo true || echo false)
echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
exit $EXIT_CODE
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
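The `potential_infra_failure` step above reduces to a case-insensitive keyword scan over the captured log tail. A rough Python equivalent of the `grep -Eqi "gpu|cuda|device"` check (the helper name is hypothetical):

```python
import re

# Keywords treated as hints of an infrastructure problem rather than a
# genuine test failure: GPU dropouts, CUDA init errors, device faults.
INFRA_PATTERN = re.compile(r"gpu|cuda|device", re.IGNORECASE)

def potential_infra_failure(log_tail: str) -> bool:
    """Return True if the log excerpt mentions an infra-related keyword."""
    return INFRA_PATTERN.search(log_tail) is not None

assert potential_infra_failure("RuntimeError: CUDA out of memory")
assert not potential_infra_failure("AssertionError: tolerance exceeded")
```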
16 changes: 10 additions & 6 deletions .github/workflows/cicd-main.yml
@@ -300,7 +300,7 @@ jobs:
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
-      RUNNER: self-hosted-azure-cpu
+      RUNNER: self-hosted-azure
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
@@ -4515,7 +4515,10 @@ jobs:
if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_ACTOR: ${{ github.actor }}
BRANCH: ${{ github.head_ref || github.ref_name }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
PR_NUMBER: ${{ github.event.number }}
@@ -4571,13 +4574,15 @@ jobs:
echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY
LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"')
LOGS=$([[ $(echo $LOGS | wc -c) -gt 0 ]] && echo -E "\`\`\`\n$LOGS\n\`\`\`" || echo "")
LOGS=$([[ $(echo $JOB | yq '.value.outputs.potential_infra_failure') == "true" ]] && echo -E "$LOGS\n\ncc: $SLACK_WEBHOOK_ADMIN" || echo -E "$LOGS")
SUMMARY=$(echo "$SUMMARY" | jq \
--arg pr "<$PR_URL|$PR_TITLE>" \
--arg job "<$JOB_URL|$JOB_NAME>" \
-            --arg logs "$LOGS" \
-            --arg author "<https://github.com/${{ github.actor }}|${{ github.actor }}>" \
-            --arg branch "<https://github.com/$REPOSITORY/tree/${{ github.head_ref || github.ref_name }}|${{ github.head_ref || github.ref_name }}>"\
+            --arg logs "$(echo -e "$LOGS")" \
+            --arg author "<https://github.com/$GITHUB_ACTOR|$GITHUB_ACTOR>" \
+            --arg branch "<https://github.com/$REPOSITORY/tree/$BRANCH|$BRANCH>"\
'. += [
{
"type": "section",
@@ -4588,8 +4593,7 @@
+ "\nJob: " + $job
+ "\nAuthor: " + $author
+ "\nBranch: " + $branch
+ "\nLogs:"
+ "```\n" + $logs + "\n```"
+ "\nLogs:" + $logs
)
}
}
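For reference, the jq template above emits one Slack Block Kit `section` per failed job. A hedged Python sketch of the same payload (the head of the template is collapsed in this view, so the leading PR field and the `mrkdwn` wrapper are assumptions based on the visible fragment):

```python
def failure_block(pr: str, job: str, author: str, branch: str, logs: str) -> dict:
    """Build one Slack Block Kit section mirroring the jq template above."""
    return {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            # `logs` already carries its own ``` fences (and an optional
            # admin cc) from the LOGS pre-processing earlier in the step.
            "text": f"PR: {pr}\nJob: {job}\nAuthor: {author}\nBranch: {branch}\nLogs:{logs}",
        },
    }

blocks = [failure_block("<url|My PR>", "<url|L2_Test>", "<url|octocat>", "<url|main>", "\n```\n...\n```")]
```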
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.17.0
-ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f
+ARG MCORE_TAG=397e9da9511a09ae8badba30129c7e4934b06118

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
119 changes: 119 additions & 0 deletions examples/audio/conf/masking_with_online_augmentation.yaml
@@ -0,0 +1,119 @@
name: "masking_with_online_augmenatation"

model:
sample_rate: 16000
skip_nan_grad: false
num_outputs: 1

train_ds:
use_lhotse: true # enable Lhotse data loader
cuts_path: ??? # path to Lhotse cuts manifest with speech signals for augmentation (including custom "target_recording" field with the same signals)
truncate_duration: 4.0 # truncation length in seconds; number of STFT time frames ≈ truncate_duration * sample_rate / encoder.hop_length
truncate_offset_type: random # if the file is longer than truncate_duration, use random offset to select a subsegment
batch_size: 64 # batch size may be increased based on the available memory
shuffle: true
num_workers: 8
pin_memory: true
rir_enabled: true # enable room impulse response augmentation
rir_path: ??? # path to Lhotse recordings manifest with room impulse response signals
noise_path: ??? # path to Lhotse cuts manifest with noise signals

validation_ds:
use_lhotse: true # enable Lhotse data loader
cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
batch_size: 64 # batch size may be increased based on the available memory
shuffle: false
num_workers: 4
pin_memory: true

test_ds:
use_lhotse: true # enable Lhotse data loader
cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
batch_size: 1 # batch size may be increased based on the available memory
shuffle: false
num_workers: 4
pin_memory: true

encoder:
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
fft_length: 512 # Length of the window and FFT for calculating spectrogram
hop_length: 256 # Hop length for calculating spectrogram

decoder:
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
fft_length: 512 # Length of the window and FFT for calculating spectrogram
hop_length: 256 # Hop length for calculating spectrogram

mask_estimator:
_target_: nemo.collections.audio.modules.masking.MaskEstimatorRNN
num_outputs: ${model.num_outputs}
num_subbands: 257 # Number of subbands of the input spectrogram
num_features: 256 # Number of features at RNN input
num_layers: 5 # Number of RNN layers
bidirectional: true # Use bi-directional RNN

mask_processor:
_target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
ref_channel: 0 # Reference channel for the output

loss:
_target_: nemo.collections.audio.losses.SDRLoss
scale_invariant: true # Use scale-invariant SDR

metrics:
val:
sdr: # output SDR
_target_: torchmetrics.audio.SignalDistortionRatio
test:
sdr_ch0: # SDR on output channel 0
_target_: torchmetrics.audio.SignalDistortionRatio
channel: 0

optim:
name: adamw
lr: 1e-4
# optimizer arguments
betas: [0.9, 0.98]
weight_decay: 1e-3

trainer:
devices: -1 # number of GPUs, -1 would use all available GPUs
num_nodes: 1
max_epochs: -1
max_steps: -1 # computed at runtime if not set
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
accelerator: auto
strategy: ddp
accumulate_grad_batches: 1
gradient_clip_val: null
precision: 32 # set to 16 for O1/O2 to enable AMP
log_every_n_steps: 25 # Interval of logging.
enable_progress_bar: true
num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training; 0 disables it
check_val_every_n_epoch: 1 # run validation every n epochs
sync_batchnorm: true
enable_checkpointing: False # Provided by exp_manager
logger: false # Provided by exp_manager

exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
# in case of multiple validation sets, first one is used
monitor: "val_loss"
mode: "min"
save_top_k: 5
always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints

resume_from_checkpoint: null # path to a checkpoint file to continue training from; restores the full state, including epoch, step, LR schedulers, apex, etc.
# set both of the following to true to resume training
resume_if_exists: false
resume_ignore_no_checkpoint: false

# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: null
project: null
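To try this recipe, the mandatory `???` fields must be supplied before training starts. A minimal sketch using OmegaConf (the manifest paths below are placeholders, not shipped data):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/audio/conf/masking_with_online_augmentation.yaml")

# Fill the mandatory (???) fields with your own Lhotse manifests.
cfg.model.train_ds.cuts_path = "/data/speech_cuts.jsonl.gz"
cfg.model.train_ds.rir_path = "/data/rir_recordings.jsonl.gz"
cfg.model.train_ds.noise_path = "/data/noise_cuts.jsonl.gz"
cfg.model.validation_ds.cuts_path = "/data/dev_cuts.jsonl.gz"
cfg.model.test_ds.cuts_path = "/data/test_cuts.jsonl.gz"

# Sanity check on the time dimension: with a center-padded STFT, 4.0 s at
# 16 kHz and hop 256 gives 1 + (4.0 * 16000) // 256 = 251 frames per example.
print(OmegaConf.to_yaml(cfg.model.train_ds))
```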
4 changes: 2 additions & 2 deletions nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py
@@ -646,9 +646,9 @@ def __init__(
)

self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self)
-        except (ImportError, ModuleNotFoundError, ValueError) as e:
+        except (ImportError, ModuleNotFoundError, ValueError, EnvironmentError) as e:
self.use_cuda_graph_decoder = False
-            logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e.msg}")
+            logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e}")
self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames
else:
self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames
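The broadened `except` clause and the switch from `e.msg` to `e` go together: only some exception types (notably `ImportError`) define a `.msg` attribute, so formatting `e.msg` for a caught `ValueError` or `EnvironmentError` would itself raise `AttributeError` inside the warning. A quick illustration:

```python
try:
    raise EnvironmentError("driver too old for CUDA graph conditional nodes")
except (ImportError, ModuleNotFoundError, ValueError, EnvironmentError) as e:
    print(f"reason: {e}")  # str(e) is safe for any exception type
    # print(e.msg)         # would raise AttributeError: 'OSError' has no 'msg'
```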
@@ -266,11 +266,11 @@ def maybe_enable_cuda_graphs(self):
try:
check_cuda_python_cuda_graphs_conditional_nodes_supported()
self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
-        except (ImportError, ModuleNotFoundError) as e:
+        except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
logging.warning(
"No conditional node support for Cuda.\n"
"Cuda graphs with while loops are disabled, decoding speed will be slower\n"
f"Reason: {e.msg}"
f"Reason: {e}"
)
self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
self.reset_cuda_graphs_state()
@@ -277,11 +277,11 @@ def maybe_enable_cuda_graphs(self):
try:
check_cuda_python_cuda_graphs_conditional_nodes_supported()
self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
-        except (ImportError, ModuleNotFoundError) as e:
+        except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
logging.warning(
"No conditional node support for Cuda.\n"
"Cuda graphs with while loops are disabled, decoding speed will be slower\n"
f"Reason: {e.msg}"
f"Reason: {e}"
)
self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
self.reset_cuda_graphs_state()
7 changes: 7 additions & 0 deletions nemo/collections/audio/data/audio_to_audio_lhotse.py
@@ -55,6 +55,13 @@ def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]:
retained_cuts = [
cut._first_non_padding_cut if isinstance(cut, MixedCut) else cut for cut in retained_padded_cuts
]

# if online augmentation is applied, some retained cuts may still be MixedCuts (containing the original speech, noise, and augmentation)
# take the first non-padding cut from each, which is expected to be the clean speech signal
for n, cut in enumerate(retained_cuts):
if isinstance(cut, MixedCut):
retained_cuts[n] = cut._first_non_padding_cut
# create cutset
retained_cuts = CutSet.from_cuts(retained_cuts)

if _key_available(retained_cuts, self.TARGET_KEY):
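A toy sketch of what the unwrapping loop does (the classes below are stand-ins, not the real Lhotse types): after online noise/RIR augmentation a retained cut can be a `MixedCut` whose first non-padding track is the original clean speech, and that is the target signal the dataset keeps.

```python
class Cut:
    def __init__(self, name: str):
        self.name = name

class MixedCut(Cut):
    """Stand-in for lhotse.cut.MixedCut: clean speech mixed with noise/RIR."""
    def __init__(self, tracks: list[Cut]):
        super().__init__("mix")
        self.tracks = tracks

    @property
    def _first_non_padding_cut(self) -> Cut:
        return self.tracks[0]  # clean speech is the first mixed-in track

retained_cuts = [MixedCut([Cut("clean_a"), Cut("noise")]), Cut("clean_b")]
for n, cut in enumerate(retained_cuts):
    if isinstance(cut, MixedCut):
        retained_cuts[n] = cut._first_non_padding_cut
assert [c.name for c in retained_cuts] == ["clean_a", "clean_b"]
```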
49 changes: 42 additions & 7 deletions nemo/collections/llm/inference/base.py
@@ -1,5 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
-from typing import Optional
+from typing import Optional, Union

import pytorch_lightning as pl
import torch
@@ -15,8 +30,9 @@
from pytorch_lightning.trainer.states import TrainerFn

import nemo.lightning as nl
from nemo.collections.llm.peft import LoRA
from nemo.lightning import io
-from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
+from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir, ckpt_to_weights_subdir
from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
from nemo.lightning.pytorch.strategies.utils import RestoreConfig

@@ -39,11 +55,21 @@ def tokenize(self, prompt):
def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl.LightningModule):
assert isinstance(trainer.strategy, MegatronStrategy), "Only MegatronStrategy is supported for trainer.strategy."
assert trainer.strategy.context_parallel_size <= 1, "Context parallelism is not supported for inference."
-    restore_config = RestoreConfig(
-        path=path,
-        load_model_state=True,
-        load_optim_state=False,
-    )
+    if (adapter_meta_path := ckpt_to_weights_subdir(path) / ADAPTER_META_FILENAME).exists():
+        with open(adapter_meta_path, "r") as f:
+            metadata = json.load(f)
+        restore_config = RestoreConfig(
+            path=metadata['model_ckpt_path'],
+            load_model_state=True,
+            load_optim_state=False,
+        )
+    else:
+        restore_config = RestoreConfig(
+            path=path,
+            load_model_state=True,
+            load_optim_state=False,
+        )

trainer.strategy.restore_config = restore_config
trainer.strategy._setup_optimizers = False
trainer.ckpt_path = None
@@ -60,6 +86,15 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl.
trainer.strategy.trainer = trainer
trainer.strategy.selective_restore()

lora: Union[io.TrainerContext, LoRA] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform")
if isinstance(lora, LoRA):
model = lora(model)
adapter_sharded_state_dict = {k: v for k, v in model.sharded_state_dict().items() if ".adapter." in k}
adapter_state = trainer.strategy.checkpoint_io.load_checkpoint(
ckpt_to_weights_subdir(path), sharded_state_dict=adapter_sharded_state_dict
)
trainer.strategy.load_model_state_dict(adapter_state, strict=False)
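The two changes above work as a pair: for a LoRA checkpoint, the base model recorded in the adapter metadata is restored first, then the adapter weights are overlaid. A minimal sketch of the path-resolution half (the metadata filename is whatever `ADAPTER_META_FILENAME` resolves to; the value below is an assumption):

```python
import json
from pathlib import Path

ADAPTER_META_FILENAME = "adapter_metadata.json"  # assumed value of the imported constant

def resolve_base_checkpoint(weights_dir: Path) -> Path:
    """Return the checkpoint whose full model state should be restored."""
    adapter_meta = weights_dir / ADAPTER_META_FILENAME
    if adapter_meta.exists():
        # LoRA checkpoint: restore the base model it was fine-tuned from;
        # the adapter weights themselves are loaded in a second pass.
        return Path(json.loads(adapter_meta.read_text())["model_ckpt_path"])
    return weights_dir  # regular checkpoint: restore in place
```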


def setup_model_and_tokenizer(
path: Path,
3 changes: 2 additions & 1 deletion nemo/collections/llm/recipes/baichuan2_7b.py
@@ -240,6 +240,7 @@ def finetune_recipe(
num_nodes: int = 1,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
packed_sequence: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Baichuan2 7B model.
@@ -272,7 +273,7 @@
`examples/llm/finetune/` directory.
"""
recipe = default_finetune_recipe(
model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node
model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 2
5 changes: 4 additions & 1 deletion nemo/collections/llm/recipes/chatglm3_6b.py
@@ -240,6 +240,7 @@ def finetune_recipe(
num_nodes: int = 1,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
packed_sequence: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for ChatGLM3 6B model.
@@ -271,7 +272,9 @@
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
-    recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node)
+    recipe = default_finetune_recipe(
+        model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node, packed_sequence
+    )
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 2
recipe.optim.config.lr = 5e-6
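Recipes gaining the flag in this commit (here `baichuan2_7b` and `chatglm3_6b`) can now request packed sequences, which concatenate short samples into fixed-length sequences to cut padding waste during fine-tuning. A hedged usage sketch with NeMo-Run (recipe arguments follow the signature shown above; the executor choice is illustrative):

```python
import nemo_run as run

from nemo.collections.llm.recipes import chatglm3_6b

recipe = chatglm3_6b.finetune_recipe(
    name="chatglm3_6b_lora_packed",
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="lora",
    packed_sequence=True,  # new flag threaded through to default_finetune_recipe
)
run.run(recipe, executor=run.LocalExecutor())
```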