Skip to content

Commit

Permalink
ci: Switch to CPU-only runner (#11035)
Browse files Browse the repository at this point in the history
* ci: Switch to CPU-only runner

Signed-off-by: Oliver Koenig <[email protected]>

* runner

Signed-off-by: Oliver Koenig <[email protected]>

* Raise EnvironmentError when CUDA is unavailable to prevent cuda-python failure

Signed-off-by: Vladimir Bataev <[email protected]>

* Fix error message

Signed-off-by: Vladimir Bataev <[email protected]>

* update

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* finalize

Signed-off-by: Oliver Koenig <[email protected]>

---------

Signed-off-by: Oliver Koenig <[email protected]>
Signed-off-by: Vladimir Bataev <[email protected]>
Co-authored-by: Vladimir Bataev <[email protected]>
  • Loading branch information
ko3n1g and artbataev authored Oct 25, 2024
1 parent 9d857ba commit 90d82dc
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 10 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ jobs:
- name: Start container
run: |
docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
ARG=("")
if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
ARG=("--runtime=nvidia --gpus all")
fi
docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ jobs:
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
RUNNER: self-hosted-azure
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
Expand Down
4 changes: 2 additions & 2 deletions nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,9 +646,9 @@ def __init__(
)

self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self)
except (ImportError, ModuleNotFoundError, ValueError) as e:
except (ImportError, ModuleNotFoundError, ValueError, EnvironmentError) as e:
self.use_cuda_graph_decoder = False
logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e.msg}")
logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e}")
self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames
else:
self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,11 @@ def maybe_enable_cuda_graphs(self):
try:
check_cuda_python_cuda_graphs_conditional_nodes_supported()
self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
except (ImportError, ModuleNotFoundError) as e:
except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
logging.warning(
"No conditional node support for Cuda.\n"
"Cuda graphs with while loops are disabled, decoding speed will be slower\n"
f"Reason: {e.msg}"
f"Reason: {e}"
)
self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
self.reset_cuda_graphs_state()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,11 +277,11 @@ def maybe_enable_cuda_graphs(self):
try:
check_cuda_python_cuda_graphs_conditional_nodes_supported()
self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
except (ImportError, ModuleNotFoundError) as e:
except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
logging.warning(
"No conditional node support for Cuda.\n"
"Cuda graphs with while loops are disabled, decoding speed will be slower\n"
f"Reason: {e.msg}"
f"Reason: {e}"
)
self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
self.reset_cuda_graphs_state()
Expand Down
9 changes: 7 additions & 2 deletions nemo/core/utils/cuda_python_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@


def check_cuda_python_cuda_graphs_conditional_nodes_supported():
# for CPU-only environment we need to raise an exception, otherwise cuda-python library will fail
if not torch.cuda.is_available():
raise EnvironmentError("CUDA is not available")

try:
from cuda import cuda
except ImportError:
Expand Down Expand Up @@ -55,11 +59,12 @@ def skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported():
"""
try:
check_cuda_python_cuda_graphs_conditional_nodes_supported()
except (ImportError, ModuleNotFoundError) as e:
except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
import pytest

pytest.skip(
f"Test using cuda graphs with conditional nodes is being skipped because cuda graphs with conditional nodes aren't supported. Error message: {e}"
"Test using cuda graphs with conditional nodes is being skipped because "
f"cuda graphs with conditional nodes aren't supported. Error message: {e}"
)


Expand Down

0 comments on commit 90d82dc

Please sign in to comment.