Update to using Model Optimizer (formerly AMMO) in PTQ workflow (#9178)
* Update PTQ to use nvidia-modelopt

Signed-off-by: Jan Lasek <[email protected]>

* Restore PTQ tests

Signed-off-by: Jan Lasek <[email protected]>

* Update docs

Signed-off-by: Jan Lasek <[email protected]>

* Comment on apply_rope_fusion

Signed-off-by: Jan Lasek <[email protected]>

* Support for calibration PP > 1

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

* Fix cicd-main.yml indent

Signed-off-by: Jan Lasek <[email protected]>

* Set data/tensor parallel groups

Signed-off-by: Jan Lasek <[email protected]>

* Install only torch dependencies

Signed-off-by: Jan Lasek <[email protected]>

* Follow up on recent modelopt changes

Signed-off-by: Jan Lasek <[email protected]>

* Model support matrix

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

* Rename PTQ script as it should be model-agnostic

Signed-off-by: Jan Lasek <[email protected]>

* Remove unused import

Signed-off-by: Jan Lasek <[email protected]>

* Update setup instructions

Signed-off-by: Jan Lasek <[email protected]>

---------

Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Co-authored-by: janekl <[email protected]>
janekl and janekl authored May 15, 2024
1 parent 1de4b49 commit 6cb618a
Showing 9 changed files with 204 additions and 119 deletions.
135 changes: 68 additions & 67 deletions .github/workflows/cicd-main.yml
@@ -132,8 +132,8 @@ jobs:
apt-get update && apt-get install libsox-fmt-all -y && \
popd
# AMMO installation
pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir
# ModelOpt installation
pip install nvidia-modelopt[torch]~=0.11.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
@@ -394,7 +394,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
model_save=/home/TestData/nlp/megatron_llama/ci_baseline
@@ -403,69 +403,70 @@
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2_PTQ_Llama2_FP8:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# tensor_model_parallel_size=2 \
# trainer.devices=2 \
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
# quantization.algorithm=fp8 \
# quantization.num_calib_size=8 \
# inference.batch_size=2 \
# export.inference_tensor_parallel=2 \
# model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo

# rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

# L2_PTQ_Llama2_INT8_SQ:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
# quantization.algorithm=int8_sq \
# quantization.num_calib_size=8 \
# inference.batch_size=2 \
# model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo

# rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_PTQ_Llama2_FP8:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
tensor_model_parallel_size=2 \
trainer.devices=2 \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=fp8 \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.inference_tensor_parallel=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_PTQ_Llama2_INT8_SQ:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# TODO: investigate int4_awq stuck issues and restore the test
#L2_PTQ_Llama2_INT4_AWQ:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
@@ -484,7 +485,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# python examples/nlp/language_modeling/megatron_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# tensor_model_parallel_size=1 \
# trainer.devices=1 \
2 changes: 0 additions & 2 deletions Dockerfile
@@ -133,8 +133,6 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
RUN pip install flash-attn
# install numba for latest containers
RUN pip install numba>=0.57.1
# install ammo
RUN pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir

# copy nemo source into a scratch image
FROM scratch as nemo-src
48 changes: 45 additions & 3 deletions docs/source/nlp/quantization.rst
@@ -10,18 +10,60 @@ PTQ enables deploying a model in a low-precision format -- FP8, INT4, or INT8 --

Model quantization has two primary benefits: reduced model memory requirements and increased inference throughput.

In NeMo, quantization is enabled by the Nvidia AMMO library -- a unified algorithmic model optimization & deployment toolkit.
In NeMo, quantization is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_ library -- a toolkit for quantizing and compressing deep learning models for optimized inference on GPUs.

The quantization process consists of the following steps:

1. Loading a model checkpoint using an appropriate parallelism strategy
2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
3. Producing an output directory or .qnemo tarball with model config (json), quantized weights (safetensors) and tokenizer config (yaml).

Loading models requires using an AMMO spec defined in `megatron.core.inference.gpt.model_specs.py <https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/inference/gpt/model_specs.py>`_ module. Typically the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory produced (or a .qnemo tarball) is ready to be used to build a serving engine with the Nvidia TensorRT-LLM library. The engine build step is also available in NeMo project in ``nemo.deploy`` and ``nemo.export`` modules.
Loading models requires using a ModelOpt spec defined in the `nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec <https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py>`_ module. Typically, the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory (or .qnemo tarball) produced is ready to be used to build a serving engine with the NVIDIA TensorRT-LLM library. The engine build step is also available in the NeMo project in the ``nemo.deploy`` and ``nemo.export`` modules.

The quantization algorithm can also be set to ``"null"`` to perform only the weight export step using the default precision for TensorRT-LLM deployment. This is useful for obtaining baseline performance and accuracy results for comparison.

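The snippet below is a minimal sketch of the ModelOpt calibration call that underlies the calibration step (step 2 above), shown on a toy ``torch`` model rather than a NeMo checkpoint. The config constants (e.g. ``mtq.INT8_SMOOTHQUANT_CFG``) and the ``forward_loop`` signature reflect the assumed ``modelopt.torch.quantization`` API and may differ between ModelOpt releases:

.. code-block:: python

    import torch
    import modelopt.torch.quantization as mtq

    # Toy stand-in for a transformer; the real workflow operates on a loaded NeMo model.
    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64))
    calib_data = [torch.randn(8, 64) for _ in range(4)]  # tiny calibration set

    def forward_loop(model):
        # Run calibration samples through the model to collect scaling statistics.
        for batch in calib_data:
            model(batch)

    # Assumed config name corresponding to the int8_sq algorithm; FP8_DEFAULT_CFG and
    # INT4_AWQ_CFG are the assumed counterparts for the other supported algorithms.
    quant_cfg = mtq.INT8_SMOOTHQUANT_CFG
    model = mtq.quantize(model, quant_cfg, forward_loop)
    # Export to a .qnemo artifact (step 3) is handled by the NeMo quantization script.
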
Support Matrix
^^^^^^^^^^^^^^

The table below presents the verified model support matrix for popular LLM architectures. Some model entries also provide a download link to a corresponding NeMo checkpoint for testing purposes. Support for other model families is experimental.

.. list-table:: Model Support Matrix
:widths: 15 15 15 15
:header-rows: 1

* - **Model Family**
- **FP8**
- **INT8_SQ**
- **INT4_AWQ**
* - Llama (1, 2, 3)
- ✅
- ✅
- ✅
* - Mistral
- ✅
- ✅
- ✅
* - `GPT-3 <https://huggingface.co/nvidia/GPT-2B-001>`_
- ✅
- ✅
- ✅
* - `Nemotron-3 8b <https://huggingface.co/nvidia/nemotron-3-8b-base-4k>`_
- ✅
- ✅
- ✅
* - Nemotron-4 15b
- ✅
- ✅
- ✅
* - StarCoder 2
- ✅
- ✅
- ✅
* - Gemma
- ✅
- ✅
- ✅


Example
^^^^^^^
@@ -31,7 +73,7 @@ The script must be launched correctly with the number of processes equal to tens

.. code-block:: bash
torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_llama_quantization.py \
torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_quantization.py \
model_file=llama2-70b-base-bf16.nemo \
tensor_model_parallel_size=8 \
pipeline_model_parallel_size=1 \
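
Following the PTQ example above, this is a hedged sketch of building and querying a TensorRT-LLM engine from the exported ``.qnemo`` artifact via the ``nemo.export`` module mentioned in the docs. The ``TensorRTLLM`` class location and its argument names are assumptions that may vary between NeMo releases, and the paths are placeholders:

.. code-block:: python

    from nemo.export import TensorRTLLM  # assumed exporter location in nemo.export

    # Placeholder paths -- substitute the .qnemo produced by the quantization script.
    exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")
    exporter.export(
        nemo_checkpoint_path="llama2-70b-base-fp8.qnemo",  # hypothetical PTQ output
        model_type="llama",
        n_gpus=8,  # assumed to match export.inference_tensor_parallel used above
    )
    print(exporter.forward(["What is post-training quantization?"]))
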
6 changes: 3 additions & 3 deletions docs/source/starthere/intro.rst
@@ -96,13 +96,13 @@ This section details the steps to clone and install the Megatron Core.
git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \
pip install .
AMMO Installation
Model Optimizer Installation

This final step involves installing the AMMO package.
This final step involves installing the Model Optimizer package.

.. code-block:: bash
pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir
pip install nvidia-modelopt[torch]~=0.11.0 --extra-index-url https://pypi.nvidia.com
.. code-block:: bash
examples/nlp/language_modeling/{megatron_llama_quantization.py → megatron_quantization.py}
@@ -25,12 +25,12 @@
Nemo quantization example script.
Please consult nemo.export.quantize.Quantizer class
and examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml config on available quantization methods,
and examples/nlp/language_modeling/conf/megatron_quantization.yaml config on available quantization methods,
models supported as well as how to set up data and inference for calibration (with defaults recommended).
Example usage:
```
python examples/nlp/language_modeling/megatron_llama_quantization.py \
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=llama2-7b-fp16.nemo \
model_save=llama2-7b-fp8.qnemo \
quantization.algorithm=fp8 \
@@ -59,7 +59,7 @@ def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, ma
yield batch


@hydra_runner(config_path="conf", config_name="megatron_llama_quantization")
@hydra_runner(config_path="conf", config_name="megatron_quantization")
def main(cfg) -> None:
if not torch.cuda.is_available():
raise EnvironmentError("GPU is required for the inference.")
nemo/collections/nlp/models/language_modeling/megatron/{gpt_layer_ammo_spec.py → gpt_layer_modelopt_spec.py}
@@ -36,8 +36,9 @@
HAVE_MEGATRON_CORE = False
IMPORT_ERROR = e

# Use this spec for AMMO PTQ and TensorRT-LLM export
def get_gpt_layer_ammo_spec() -> ModuleSpec:

# Use this spec for Model Optimizer PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec() -> ModuleSpec:
"""Mix the native spec with TENorm.
This is essentially the native local spec except for the layernorm implementation
@@ -65,7 +66,11 @@ def get_gpt_layer_ammo_spec() -> ModuleSpec:
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,),
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear,
linear_fc2=RowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
# Map TE-layernorm-fusion keys back
nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -41,7 +41,7 @@
from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import (
get_gpt_full_te_layer_autocast_spec,
)
from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_ammo_spec import get_gpt_layer_ammo_spec
from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import get_gpt_layer_modelopt_spec
from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.modules.common.megatron.build_model import build_model
@@ -154,7 +154,7 @@ def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True):
"te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm),
"megatron_falcon_gpt": get_falcon_layer_spec(),
"megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(),
"ammo": get_gpt_layer_ammo_spec(),
"modelopt": get_gpt_layer_modelopt_spec(),
}
if spec_name not in name_spec_dict:
raise ValueError(f"Spec name '{spec_name}' is not recognized.")
