From 90d82dce00b521ef2749e8ee770f15796bc79a63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 26 Oct 2024 00:22:09 +0200 Subject: [PATCH] ci: Switch to CPU only runner (#11035) * ci: Switch to CPU only runner Signed-off-by: Oliver Koenig * runner Signed-off-by: Oliver Koenig * Raise EnvironmentError when CUDA is unavailable to prevent cuda-python failure Signed-off-by: Vladimir Bataev * Fix error message Signed-off-by: Vladimir Bataev * update Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev --- .github/workflows/_test_template.yml | 7 ++++++- .github/workflows/cicd-main.yml | 2 +- .../asr/parts/submodules/rnnt_greedy_decoding.py | 4 ++-- .../asr/parts/submodules/rnnt_loop_labels_computer.py | 4 ++-- .../asr/parts/submodules/tdt_loop_labels_computer.py | 4 ++-- nemo/core/utils/cuda_python_utils.py | 9 +++++++-- 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index c0aedbc1524e..17cceb665747 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -51,7 +51,12 @@ jobs: - name: Start container run: | - docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))" + ARG=("") + if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then + ARG=("--runtime=nvidia --gpus all") + fi + + docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))" - id: main name: Run main script diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d5b4d2d8081e..098b9d635cb3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -300,7 +300,7 @@ jobs: uses: ./.github/workflows/_test_template.yml if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' with: - RUNNER: self-hosted-azure-cpu + RUNNER: self-hosted-azure TIMEOUT: 20 SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index 996ac8977f35..f9cf368fe405 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -646,9 +646,9 @@ def __init__( ) self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self) - except (ImportError, ModuleNotFoundError, ValueError) as e: + except (ImportError, ModuleNotFoundError, ValueError, EnvironmentError) as e: self.use_cuda_graph_decoder = False - logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e.msg}") + logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e}") self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames else: self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index c0783c301c44..13bb0b471ed2 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -266,11 +266,11 @@ def maybe_enable_cuda_graphs(self): try: check_cuda_python_cuda_graphs_conditional_nodes_supported() self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH - except (ImportError, ModuleNotFoundError) as e: + except (ImportError, ModuleNotFoundError, EnvironmentError) as e: logging.warning( "No conditional node support for Cuda.\n" "Cuda graphs with while loops are disabled, decoding speed will be slower\n" - f"Reason: {e.msg}" + f"Reason: {e}" ) self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS self.reset_cuda_graphs_state() diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index 61bd71f09037..c0fbe5361761 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -277,11 +277,11 @@ def maybe_enable_cuda_graphs(self): try: check_cuda_python_cuda_graphs_conditional_nodes_supported() self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH - except (ImportError, ModuleNotFoundError) as e: + except (ImportError, ModuleNotFoundError, EnvironmentError) as e: logging.warning( "No conditional node support for Cuda.\n" "Cuda graphs with while loops are disabled, decoding speed will be slower\n" - f"Reason: {e.msg}" + f"Reason: {e}" ) self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS self.reset_cuda_graphs_state() diff --git a/nemo/core/utils/cuda_python_utils.py b/nemo/core/utils/cuda_python_utils.py index eb8897df0797..8bd25333488f 100644 --- a/nemo/core/utils/cuda_python_utils.py +++ b/nemo/core/utils/cuda_python_utils.py @@ -22,6 +22,10 @@ def check_cuda_python_cuda_graphs_conditional_nodes_supported(): + # for CPU-only environment we need to raise an exception, otherwise cuda-python library will fail + if not torch.cuda.is_available(): + raise EnvironmentError("CUDA is not available") + try: from cuda import cuda except ImportError: @@ -55,11 +59,12 @@ def skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported(): """ try: check_cuda_python_cuda_graphs_conditional_nodes_supported() - except (ImportError, ModuleNotFoundError) as e: + except (ImportError, ModuleNotFoundError, EnvironmentError) as e: import pytest pytest.skip( - f"Test using cuda graphs with conditional nodes is being skipped because cuda graphs with conditional nodes aren't supported. Error message: {e}" + "Test using cuda graphs with conditional nodes is being skipped because " + f"cuda graphs with conditional nodes aren't supported. Error message: {e}" )