NeVA media_type fix #9255

Closed
wants to merge 36 commits

Changes from all commits (36 commits)
47b1553
Add SpeechLM to main (#8741)
stevehuang52 May 11, 2024
820a285
ASR_dev_run_Speech_To_Text_HF_Finetuning optional as flaky (#9180)
pablo-garay May 13, 2024
a0e9ee3
update (#9181)
ericharper May 13, 2024
7a23bfa
Change FIM Dataset Random Seed Init (#9165)
suiyoubi May 13, 2024
43686ec
increase time limit for Speech_Checkpoints_tests (#9186)
pablo-garay May 13, 2024
467d94b
fix ep rank (#9161)
gdengk May 13, 2024
77090d4
TRTLLM new API support (#9003)
meatybobby May 13, 2024
b1628cf
Alit/optim 8k (#9166)
JRD971000 May 14, 2024
93907f0
Bucketing duration bins: less optimal but instant init when not provi…
pzelasko May 14, 2024
acbd4e0
Enable CUDA graphs by default only for transcription (#9196) (#9197)
github-actions[bot] May 14, 2024
4167641
move tts fixtures (#9183)
blisc May 14, 2024
4d574fe
enable matryoshka embedding learning (#9130)
arendu May 14, 2024
5df8e11
Add guards to SD imports (#9158)
yaoyu-33 May 14, 2024
c2daa91
Implement async distributed checkpoint save (#9028)
mikolajblaz May 15, 2024
1de4b49
Fix incorrect checkpoint removal logic (#9192)
mikolajblaz May 15, 2024
6cb618a
Update to using Model Optimizer (formerly AMMO) in PTQ workflow (#9178)
janekl May 15, 2024
061cc45
GPU-based vectorized Specaug Version 2 (#9155)
amorari May 15, 2024
964ea3c
run_cicd_for_release_branches_also (#9213)
pablo-garay May 16, 2024
d0a4535
Update nemo.export module for quantized models (#9218)
janekl May 16, 2024
b489fba
Update index.rst (#9080)
jgerh May 16, 2024
526b6ad
ci: Speeding NeMo-CI up by using caching (#9174)
ko3n1g May 16, 2024
e465b9c
Add save option to the TRT-LLM export test script (#9221)
oyilmaz-nvidia May 16, 2024
73edac4
rename paths2audiofiles to audio (#9209) (#9220)
github-actions[bot] May 16, 2024
d2e047a
Checkpoint resuming compatible for 2403 container (#9199)
suiyoubi May 17, 2024
b715f5a
support QWen1.5/QWen2 (#9055)
Agoniii May 17, 2024
18eed4d
Implement export with PyT Distributed checkpoints (#9058)
mikolajblaz May 17, 2024
ce1612d
ci: Multi-tenancy for tests and garbage collection (#9179)
ko3n1g May 17, 2024
eb31309
Lhotse Sharding Fix (#9187)
tbartley94 May 17, 2024
7f3e535
fix graphviz installation for local run (#9233) (#9234)
github-actions[bot] May 17, 2024
67401ed
Support dataloader as input to `audio` for transcription (#9201)
titu1994 May 17, 2024
51c2c3f
NeMo Dev Doc Feature Updates 1: Some parallelisms (#9184)
yaoyu-33 May 17, 2024
659e025
Clean up dev docs collection section (#9205)
yaoyu-33 May 17, 2024
0744016
use get with fallback when reading checkpoint_callback_params (#9223)
akoumpa May 17, 2024
cd6d67b
Revert rope fusion defaults (#9237)
cuichenx May 17, 2024
1d576e4
Update Online_Offline_Microphone_VAD_Demo.ipynb (#9251)
stevehuang52 May 20, 2024
d11324e
neva media_type fix
paul-gibbons May 20, 2024
293 changes: 135 additions & 158 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions Dockerfile
@@ -133,8 +133,6 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
RUN pip install flash-attn
# install numba for latest containers
RUN pip install numba>=0.57.1
# install ammo
RUN pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir

# copy nemo source into a scratch image
FROM scratch as nemo-src
74 changes: 74 additions & 0 deletions Dockerfile.ci
@@ -0,0 +1,74 @@
# syntax=docker/dockerfile:1-labs

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3

FROM ${BASE_IMAGE}

ENV TRANSFORMERS_OFFLINE=0
ENV HYDRA_FULL_ERROR=1
ENV PYTHONUNBUFFERED=1

# APT packages
RUN <<"EOF" bash -ex
apt-get update
apt-get install -y bc libsox-fmt-all -y
apt-get clean
EOF

WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
--mount=type=bind,source=tools,target=tools \
--mount=type=bind,source=setup.py,target=setup.py \
--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"

# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
git checkout ${MCORE_TAG} && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
EOF

# Copy over NeMo code
COPY ./ ./
RUN <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation ".[all]"

# set permission
chmod 777 -R /workspace
EOF

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

26 changes: 19 additions & 7 deletions docs/source/core/core_index.rst → docs/source/apis.rst
@@ -14,14 +14,26 @@ You can learn more about aspects of the NeMo "core" by following the links below
:name: core
:titlesonly:

core
neural_modules
exp_manager
neural_types
export
adapters/intro
api
core/core
core/neural_modules
core/exp_manager
core/neural_types
core/export
core/adapters/intro

You can learn more about aspects of the NeMo APIs by following the links below:

.. toctree::
:maxdepth: 1
:name: API
:titlesonly:

core/api
common/intro
nlp/api
multimodal/api
asr/api
tts/api


Alternatively, you can jump straight to the documentation for the individual collections:
4 changes: 2 additions & 2 deletions docs/source/asr/api.rst
@@ -1,5 +1,5 @@
NeMo ASR Collection API
=======================
NeMo ASR API
============


Model Classes
4 changes: 2 additions & 2 deletions docs/source/asr/ssl/intro.rst
@@ -1,5 +1,5 @@
Self-Supervised Learning
=================================
Speech Self-Supervised Learning
===============================

Self-Supervised Learning (SSL) refers to the problem of learning without explicit labels. As
any learning process requires feedback, without explicit labels, SSL derives supervisory signals from
42 changes: 14 additions & 28 deletions docs/source/collections.rst
@@ -11,26 +11,9 @@ Documentation for the individual collections
:titlesonly:

nlp/nemo_megatron/intro
nlp/models
nlp/machine_translation/machine_translation
nlp/megatron_onnx_export
nlp/quantization
nlp/api


.. toctree::
:maxdepth: 1
:caption: Speech AI
:name: Speech AI
:titlesonly:

asr/intro
asr/speech_classification/intro
asr/speaker_recognition/intro
asr/speaker_diarization/intro
asr/ssl/intro
asr/speech_intent_slot/intro


.. toctree::
:maxdepth: 1
@@ -42,29 +25,32 @@ Documentation for the individual collections
multimodal/vlm/intro
multimodal/text2img/intro
multimodal/nerf/intro
multimodal/api


.. toctree::
:maxdepth: 1
:caption: Text To Speech (TTS)
:name: Text To Speech
:caption: Vision (CV)
:name: vision
:titlesonly:

tts/intro
vision/intro

.. toctree::
:maxdepth: 1
:caption: Vision (CV)
:name: vision
:caption: Speech AI
:name: Speech AI
:titlesonly:

vision/intro
asr/intro
asr/speech_classification/intro
asr/speaker_recognition/intro
asr/speaker_diarization/intro
asr/ssl/intro
asr/speech_intent_slot/intro

.. toctree::
:maxdepth: 1
:caption: Common
:name: Common
:caption: Text To Speech (TTS)
:name: Text To Speech
:titlesonly:

common/intro
tts/intro
4 changes: 2 additions & 2 deletions docs/source/common/intro.rst
@@ -1,5 +1,5 @@
Common Collection
=================
NeMo Common Collection API
==========================

The common collection contains things that could be used across all collections.

4 changes: 2 additions & 2 deletions docs/source/core/api.rst
@@ -1,6 +1,6 @@

Core APIs
=========
NeMo Core APIs
==============

Base class for all NeMo models
------------------------------
58 changes: 56 additions & 2 deletions docs/source/features/memory_optimizations.rst
@@ -11,7 +11,7 @@ Flash Attention
Overview
^^^^^^^^

Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms.
Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. Flash Attention is an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high-bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms.
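
For intuition, the difference can be seen directly with PyTorch's scaled dot-product attention backends. The following is a minimal sketch (assuming a CUDA device and PyTorch 2.x); it uses PyTorch's backend toggle directly and is independent of NeMo's own configuration switch described below.

.. code-block:: python

    # Minimal sketch: toggle between the fused Flash kernel and the unfused "math"
    # path of PyTorch's scaled_dot_product_attention (assumes CUDA and PyTorch 2.x).
    import torch
    import torch.nn.functional as F

    q, k, v = (torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16) for _ in range(3))

    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
        out_flash = F.scaled_dot_product_attention(q, k, v)  # tiled, IO-aware kernel

    with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False):
        out_math = F.scaled_dot_product_attention(q, k, v)   # materializes the full score matrix

    print((out_flash - out_math).abs().max())  # numerically close; the flash path avoids the O(seq^2) memory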

Turn Flash Attention On and Off
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -38,4 +38,58 @@ Selective Activation Recomputation
""""""""""""""""""""""""""""""""""
This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost.

Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198
Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198.

Multi-query Attention (MQA) and Grouped-query Attention (GQA)
-------------------------------------------------------------

**Multi-query Attention (MQA)** and **Grouped-query Attention (GQA)** are modifications of the traditional multihead attention mechanism in Transformer models. These methods improve the efficiency and effectiveness of attention mechanisms.

Overview
^^^^^^^^

**Multi-query Attention (MQA)**
MQA treats all attention heads as a single group, reducing computational complexity and accelerating training times. It is beneficial when model scalability or limited computational resources are concerns.

**Grouped-query Attention (GQA)**
GQA groups the heads into clusters, each processing a subset of queries independently. This method balances the detailed focus of traditional multihead attention with the broad approach of MQA, enhancing nuanced input data processing.

These attention variants offer:

- **Reduced computational load**: Both methods decrease computation, beneficial for large models.
- **Increased processing speed**: Simplifying attention leads to faster training and inference.
- **Flexibility and adaptability**: Adjustments can be made based on task needs or hardware constraints.
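
A minimal PyTorch sketch of the grouping idea (an illustration only; the actual implementation lives in Megatron Core's Attention class, referenced further below):

.. code-block:: python

    # Illustrative sketch: in grouped-query attention, several query heads share one K/V head.
    import torch
    import torch.nn.functional as F

    batch, seq, head_dim = 2, 16, 64
    num_query_heads, num_query_groups = 8, 2   # MQA would be num_query_groups = 1

    q = torch.randn(batch, num_query_heads, seq, head_dim)
    k = torch.randn(batch, num_query_groups, seq, head_dim)   # one K head per group
    v = torch.randn(batch, num_query_groups, seq, head_dim)   # one V head per group

    # Broadcast each K/V head to the query heads in its group, then attend as usual.
    heads_per_group = num_query_heads // num_query_groups
    k = k.repeat_interleave(heads_per_group, dim=1)
    v = v.repeat_interleave(heads_per_group, dim=1)
    out = F.scaled_dot_product_attention(q, k, v)   # (batch, num_query_heads, seq, head_dim)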

Enable MQA and GQA
^^^^^^^^^^^^^^^^^^

To use MQA or GQA in the NeMo Framework, adjust the ``num_query_groups`` parameter in the model configuration:

1. **For Multi-query Attention (MQA)**:
- Set ``num_query_groups`` to `1` to treat all attention heads as a single group.

.. code-block:: yaml

num_query_groups: 1 # Enables Multi-query Attention

2. **For Grouped-query Attention (GQA)**:
- Set ``num_query_groups`` to a number that is a divisor of the total number of attention heads (more than one but less than the total heads).

.. code-block:: yaml

num_query_groups: <number_of_groups> # Enables Grouped-query Attention

- For regular attention, set this parameter to `None` or match it with the number of heads.

.. code-block:: yaml

num_query_groups: null # Default setting for regular multihead attention

Adjust the ``num_query_groups`` to explore different attention mechanisms and optimize your model's performance based on specific needs.
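
The mapping from ``num_query_groups`` to the resulting attention variant can be summarized in a few lines; the helper below is a hypothetical sketch, not part of NeMo, and only assumes that ``num_query_groups`` must evenly divide the number of attention heads.

.. code-block:: python

    # Sketch: map a num_query_groups setting to the attention variant it selects.
    from typing import Optional

    def attention_variant(num_attention_heads: int, num_query_groups: Optional[int]) -> str:
        if num_query_groups is None or num_query_groups == num_attention_heads:
            return "regular multi-head attention"
        if num_query_groups == 1:
            return "multi-query attention (MQA)"
        if num_attention_heads % num_query_groups != 0:
            raise ValueError("num_query_groups must evenly divide num_attention_heads")
        return "grouped-query attention (GQA)"

    print(attention_variant(32, None))  # regular multi-head attention
    print(attention_variant(32, 1))     # multi-query attention (MQA)
    print(attention_variant(32, 8))     # grouped-query attention (GQA)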

Implement MQA or GQA
^^^^^^^^^^^^^^^^^^^^

NeMo's support for GQA and MQA is enabled through the integration of Megatron Core's Attention mechanism. The underlying implementation details can be explored within the Attention class of Megatron Core, which provides the functional backbone for these advanced attention methods. To understand the specific modifications and implementations of MQA and GQA, refer to the source code in the Attention class:

Check implementation details from Attention Class in Megatron Core Repo: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py#L49