Commit 8af0b31

Merge branch 'main' into vchen/neva-blend-data

xuanzic authored Aug 1, 2024
2 parents: 4ddd2d9 + e5b0fef
Showing 149 changed files with 23,029 additions and 2,179 deletions.
2 changes: 2 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,2 @@
.github/ @pablo-garay @ko3n1g
Dockerfile.ci @pablo-garay @ko3n1g
1,632 changes: 764 additions & 868 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
-ARG MCORE_TAG=c7a1f82d761577e6ca0338d3521eac82f2aa0904
+ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -90,4 +90,3 @@ chmod 777 -R /workspace
EOF

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

26 changes: 19 additions & 7 deletions Dockerfile.speech
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
@@ -62,23 +62,28 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

WORKDIR /workspace/

+ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
+ARG MCORE_TAG=338af51452a53982d202e8386db6233adad1ce86
+ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
# Install Megatron Core; this can be removed once the 0.3 pip package is released
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
-    git checkout c7a1f82d761577e6ca0338d3521eac82f2aa0904 && \
+    git checkout ${MCORE_TAG} && \
pip install .

# Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
-    git checkout f058162b215791b15507bb542f22ccfde49c872d && \
-    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+    git checkout ${APEX_TAG} && \
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir \
+    --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Transformer Engine 1.2.0
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
-    git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
+    git fetch origin ${TE_TAG} && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
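
Since the dependency pins are now build arguments rather than hardcoded hashes, a specific commit can be tested without editing the Dockerfile by overriding them at build time. A minimal sketch (the image tag and placeholder commit values are illustrative, not part of this commit):

    # Sketch: rebuild the speech image against different pinned commits
    docker build -f Dockerfile.speech \
        --build-arg MCORE_TAG=<megatron-lm-commit> \
        --build-arg TE_TAG=<transformer-engine-commit> \
        --build-arg APEX_TAG=<apex-commit> \
        -t nemo-speech:dev .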
@@ -126,7 +131,9 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL
WORKDIR /tmp/nemo
ENV LHOTSE_REQUIRE_TORCHAUDIO=0
COPY requirements .
-RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
+# Exclude requirements_vllm.txt, since `vllm==0.5.x` breaks the container due to its hardcoded requirement `torch==2.3.0`.
+RUN for f in $(ls requirements*.txt | grep -v 'requirements_vllm.txt'); do \
+    pip3 install --disable-pip-version-check --no-cache-dir -r $f; done

# install flash attention
RUN pip install flash-attn
@@ -151,7 +158,12 @@ RUN /usr/bin/test -n "$NEMO_VERSION" && \
RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]"

# Check install
RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
# NB: adjusting LD_LIBRARY_PATH (only here, should not be persistent!) is a temporary hack
# to avoid failure if CUDA is unavailable (`docker build` does not expose GPUs)
# The error is raised in NeMo Core, and the main reason is reinstalled Transformer-Engine;
RUN export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CUDA_HOME}/compat/lib.real && \
python -c "import nemo.collections.asr as nemo_asr" && \
python -c "import nemo.collections.nlp as nemo_nlp" && \
python -c "import nemo.collections.tts as nemo_tts" && \
python -c "import nemo_text_processing.text_normalization as text_normalization"

116 changes: 76 additions & 40 deletions README.md
@@ -10,10 +10,38 @@
# **NVIDIA NeMo Framework**

## Latest News

<!-- markdownlint-disable -->
<details open>
-<summary><b>Large Language Models and Multimodal</b></summary>
+<summary><b>Large Language Models and Multimodal Models</b></summary>
<details>
<summary>
<a href="https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama/index.html#new-llama-3-1-support for more information/">
New Llama 3.1 Support
</a> (2024-07-23)
</summary>
The NeMo Framework now supports training and customizing the Llama 3.1 collection of LLMs from Meta.
<br><br>
</details>
<details>
<summary>
<a href="https://aws.amazon.com/blogs/machine-learning/accelerate-your-generative-ai-distributed-training-workloads-with-the-nvidia-nemo-framework-on-amazon-eks/">
Accelerate your Generative AI Distributed Training Workloads with the NVIDIA NeMo Framework on Amazon EKS
</a> (2024/07/16)
</summary>
NVIDIA NeMo Framework now runs distributed training workloads on an Amazon Elastic Kubernetes Service (Amazon EKS) cluster. For step-by-step instructions on creating an EKS cluster and running distributed training workloads with NeMo, see the GitHub repository <a href="https://github.com/aws-samples/awsome-distributed-training/tree/main/3.test_cases/2.nemo-launcher/EKS/">here</a>.
<br><br>
</details>
<details>
<summary>
<a href="https://developer.nvidia.com/blog/nvidia-nemo-accelerates-llm-innovation-with-hybrid-state-space-model-support/">
NVIDIA NeMo Accelerates LLM Innovation with Hybrid State Space Model Support
</a> (2024/06/17)
</summary>
NVIDIA NeMo and Megatron Core now support pre-training and fine-tuning of state space models (SSMs). NeMo also supports training models based on the Griffin architecture as described by Google DeepMind.
<br><br>
</details>
<details>
<summary>
<a href="https://huggingface.co/models?sort=trending&search=nvidia%2Fnemotron-4-340B">
NVIDIA releases 340B base, instruct, and reward models pretrained on a total of 9T tokens.
@@ -46,45 +74,6 @@
The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.
<br><br>
</details>
<details>
<summary>
<a href="https://blogs.nvidia.com/blog/bria-builds-responsible-generative-ai-using-nemo-picasso/">
Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso
</a> (2024/03/06)
</summary>
Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises, now leverages the NVIDIA NeMo Framework.
The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation.
Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.
<br><br>
</details>
<details>
<summary>
<a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/">
New NVIDIA NeMo Framework Features and NVIDIA H200
</a> (2023/12/06)
</summary>
NVIDIA NeMo Framework now includes several optimizations and enhancements:
1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models,
2) Mixture of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale,
3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and
4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs.
<br><br>
<a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility">
<img src="https://github.com/sbhavani/TransformerEngine/blob/main/docs/examples/H200-NeMo-performance.png" alt="H200-NeMo-performance" style="width: 600px;"></a>
<br><br>
</details>
<details>
<summary>
<a href="https://blogs.nvidia.com/blog/nemo-amazon-titan/">
NVIDIA now powers training for Amazon Titan Foundation models
</a> (2023/11/28)
</summary>
NVIDIA NeMo Framework now enables efficient training of large language models (LLMs) for the Amazon Titan foundation models (FMs).
The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock.
The NeMo Framework provides versatile tooling for building, customizing, and running LLMs.
<br><br>
</details>
</details>

<details open>
@@ -604,6 +593,53 @@ to the `gh-pages-src` branch of this repository. For detailed
information, please consult the README located at the [gh-pages-src
branch](https://github.com/NVIDIA/NeMo/tree/gh-pages-src#readme).

## Blogs

<!-- markdownlint-disable -->
<details open>
<summary><b>Large Language Models and Multimodal Models</b></summary>
<details>
<summary>
<a href="https://blogs.nvidia.com/blog/bria-builds-responsible-generative-ai-using-nemo-picasso/">
Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso
</a> (2024/03/06)
</summary>
Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises, now leverages the NVIDIA NeMo Framework.
The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation.
Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.
<br><br>
</details>
<details>
<summary>
<a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/">
New NVIDIA NeMo Framework Features and NVIDIA H200
</a> (2023/12/06)
</summary>
NVIDIA NeMo Framework now includes several optimizations and enhancements:
1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models,
2) Mixture of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale,
3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and
4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs.
<br><br>
<a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility">
<img src="https://github.com/sbhavani/TransformerEngine/blob/main/docs/examples/H200-NeMo-performance.png" alt="H200-NeMo-performance" style="width: 600px;"></a>
<br><br>
</details>
<details>
<summary>
<a href="https://blogs.nvidia.com/blog/nemo-amazon-titan/">
NVIDIA now powers training for Amazon Titan Foundation models
</a> (2023/11/28)
</summary>
NVIDIA NeMo Framework now enables efficient training of large language models (LLMs) for the Amazon Titan foundation models (FMs).
The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock.
The NeMo Framework provides versatile tooling for building, customizing, and running LLMs.
<br><br>
</details>
</details>
<!-- markdownlint-enable -->

## Licenses

- [NeMo GitHub Apache 2.0
1 change: 1 addition & 0 deletions docs/source/collections.rst
@@ -25,6 +25,7 @@ Documentation for the individual collections
multimodal/vlm/intro
multimodal/text2img/intro
multimodal/nerf/intro
+multimodal/speech_llm/intro

.. toctree::
:maxdepth: 1
55 changes: 8 additions & 47 deletions docs/source/core/exp_manager.rst
@@ -248,48 +248,6 @@ You might also want to adjust the callback parameters:
Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes).

.. _exp_manager_straggler_det_support-label:

.. note::
    The straggler detection feature is included in the optional NeMo resiliency package.

Distributed training can be affected by stragglers: slow workers that delay the overall training process.
NeMo provides a straggler detection feature that can identify these slower GPUs.

This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default.

The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best).
A performance score can be interpreted as the ratio of current performance to reference performance.

There are two types of performance scores provided by the callback:
- Relative GPU performance score: The best-performing GPU in the workload is used as a reference.
- Individual GPU performance score: The best historical performance of the GPU is used as a reference.

Examples:
- If the relative performance score is 0.5, the GPU is running at half the speed of the fastest GPU.
- If the individual performance score is 0.5, the GPU is running at half of its best observed performance.

If a GPU performance score drops below the specified threshold, it is identified as a straggler.

To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file.
You might also want to adjust the callback parameters:

.. code-block:: yaml

    exp_manager:
        ...
        create_straggler_detection_callback: True
        straggler_detection_callback_params:
            report_time_interval: 300          # Interval [seconds] of the straggler check
            calc_relative_gpu_perf: True       # Calculate relative GPU performance
            calc_individual_gpu_perf: True     # Calculate individual GPU performance
            num_gpu_perf_scores_to_log: 5      # Log the 5 best and 5 worst GPU performance scores, even if no stragglers are detected
            gpu_relative_perf_threshold: 0.7   # Threshold for relative GPU performance scores
            gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores
            stop_if_detected: True             # Terminate the workload if stragglers are detected

Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes).

Fault Tolerance
---------------

@@ -334,9 +292,10 @@ Timeouts for fault detection need to be adjusted for a given workload:
checkpointing related operations should be taken into account.

If ``calculate_timeouts: True``, timeouts will be automatically estimated based on observed intervals.
-Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated after
-checkpoint loading and saving was observed**. For example, in multi-part training started from scratch,
-estimated timeouts won't be available during the first run. Estimated timeouts are stored in the checkpoint.
+Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated
+at the end of a training run, once checkpoint loading and saving have been observed**. Hence, in a multi-part
+training started from scratch, estimated timeouts won't be available during the first two runs.
+Estimated timeouts are stored in a separate JSON file.

``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster.
This feature requires the SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures``
@@ -346,10 +305,12 @@ subsequent jobs failed (SLURM job exit code is `!= 0`) or the training is completed

Summary of all FT configuration items:
* ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor.
-* ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout for the first heartbeat from a rank.
-* ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout for subsequent heartbeats from a rank.
+* ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout [seconds] for the first heartbeat from a rank.
+* ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout [seconds] for subsequent heartbeats from a rank.
* ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout``
  based on the observed heartbeat intervals.
+* ``safety_factor`` (float, default=5.0) When calculating the timeouts, multiply the maximum observed heartbeat interval
+  by this factor to obtain the timeout estimate. Can be made smaller for stable environments and larger for unstable ones.
* ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when failure is detected.
* ``log_level`` (str, default='INFO') Log level for the FT client and server (rank monitor).
* ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank.
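
Putting these items together, a fault tolerance section under ``exp_manager`` might look like the sketch below. The ``create_fault_tolerance_callback`` and ``fault_tolerance_params`` keys are assumptions named by analogy with the straggler detection example above and are not shown in this diff; the parameter names and defaults come from the summary list.

.. code-block:: yaml

    exp_manager:
        ...
        create_fault_tolerance_callback: True    # assumed key, by analogy with create_straggler_detection_callback
        fault_tolerance_params:                  # assumed group name; parameters below are from the summary above
            workload_check_interval: 5.0         # Periodic workload check interval [seconds]
            initial_rank_heartbeat_timeout: 3600 # 60.0 * 60.0 seconds (documented default)
            rank_heartbeat_timeout: 2700         # 45.0 * 60.0 seconds (documented default)
            calculate_timeouts: True             # Estimate timeouts from observed heartbeat intervals
            safety_factor: 5.0                   # Multiplier on the max observed heartbeat interval
            rank_termination_signal: SIGKILL     # Signal used to terminate a rank on failure
            log_level: INFO
            max_rank_restarts: 0                 # Used by the FT launcher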