Skip to content

Commit

Permalink
Add torchrun local executor to recipes (#11342)
Browse files Browse the repository at this point in the history
* Add torchrun local executor to recipes

Signed-off-by: Marc Romeijn <[email protected]>

* Remove code from init

Signed-off-by: Marc Romeijn <[email protected]>

* Fix copyright

Signed-off-by: Marc Romeijn <[email protected]>

* Apply isort and black reformatting

Signed-off-by: marcromeyn <[email protected]>

* Fix failing test

Signed-off-by: Marc Romeyn <[email protected]>

---------

Signed-off-by: Marc Romeijn <[email protected]>
Signed-off-by: marcromeyn <[email protected]>
Signed-off-by: Marc Romeyn <[email protected]>
Co-authored-by: marcromeyn <[email protected]>
  • Loading branch information
marcromeyn and marcromeyn authored Nov 21, 2024
1 parent ed05600 commit 34f7408
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 0 deletions.
2 changes: 2 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
)
from nemo.collections.llm.recipes.log.default import default_log, default_resume
from nemo.collections.llm.recipes.optim import adam
from nemo.collections.llm.recipes.run.executor import torchrun

__all__ = [
"baichuan2_7b",
Expand Down Expand Up @@ -134,4 +135,5 @@
"adam",
"default_log",
"default_resume",
"torchrun",
]
13 changes: 13 additions & 0 deletions nemo/collections/llm/recipes/run/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
35 changes: 35 additions & 0 deletions nemo/collections/llm/recipes/run/executor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import nemo_run as run


@run.cli.factory
def torchrun(devices: int = 8) -> run.Config[run.LocalExecutor]:
    """Build a local-executor configuration that launches via torchrun.

    Args:
        devices: Value forwarded as ``ntasks_per_node`` to the executor
            config (presumably the number of processes/GPUs to start on
            the local machine -- confirm against ``run.LocalExecutor``).
            Defaults to 8.

    Returns:
        A ``run.Config`` wrapping ``run.LocalExecutor`` with the
        ``"torchrun"`` launcher and a fixed set of environment variables.
    """
    return run.Config(
        run.LocalExecutor,
        ntasks_per_node=devices,
        launcher="torchrun",
        # Fixed environment handed to the executor; values are strings
        # because they are passed through as environment variables.
        env_vars={
            "TRANSFORMERS_OFFLINE": "1",
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
            "NCCL_NVLS_ENABLE": "0",
            "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
            "NVTE_ASYNC_AMAX_REDUCTION": "1",
        },
    )

0 comments on commit 34f7408

Please sign in to comment.