Akoumparouli/nemo ux test fixes (#10641)

* NeMoLogger arg rename dir -> log_dir

  Signed-off-by: Alexandros Koumparoulis <[email protected]>

* Set NVTE_APPLY_QK_LAYER_SCALING=0

  Signed-off-by: Alexandros Koumparoulis <[email protected]>

* call mbs reconfigure to avoid interference from other tests; rewrite os.environ :( to avoid interference from other tests

  Signed-off-by: Alexandros Koumparoulis <[email protected]>

* Apply isort and black reformatting

  Signed-off-by: akoumpa <[email protected]>

---------

Signed-off-by: Alexandros Koumparoulis <[email protected]>
Signed-off-by: akoumpa <[email protected]>
Co-authored-by: akoumpa <[email protected]>
NVIDIA · Oct 4, 2024 · 7877766 · 7877766
1 parent f648e09
commit 7877766
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 13 deletions.
diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py
@@ -14,7 +14,11 @@
 
 import os
 
-os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '1'
+
+def set_env():
+    os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '1'
+
+
 from pathlib import Path
 
 import pytest
@@ -70,6 +74,8 @@ class TestDistCkptIO:
     @pytest.mark.run_only_on('GPU')
     def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path):
 
+        set_env()
+        assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1'
         model, data = get_model_and_data()
 
         strategy = _get_strategy()
@@ -92,10 +98,12 @@ def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path):
         assert len(ckpts) == 1
         ckpt = ckpts[0]
         assert str(ckpt) == _get_last_checkpoint_dir(model)
+        trainer._teardown()
 
     @pytest.mark.run_only_on('GPU')
     def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path):
-
+        set_env()
+        assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1'
         model, data = get_model_and_data()
 
         sync_ckpt_dir = tmp_path / 'sync_checkpoints'
@@ -156,9 +164,11 @@ def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path):
         for k in sync_state_dict['sharded_state_dict'].keys():
             if isinstance(sync_state_dict['sharded_state_dict'][k], torch.Tensor):
                 assert torch.all(sync_state_dict['sharded_state_dict'][k] == async_state_dict['sharded_state_dict'][k])
+        dummy_trainer._teardown()
 
     def test_sharded_strategies(self):
-
+        set_env()
+        assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '1'
         model_checkpoint = nl.ModelCheckpoint()
 
         strategy = nl.MegatronStrategy(
@@ -181,3 +191,4 @@ def test_sharded_strategies(self):
         assert base_checkpoint_io.save_ckpt_format == 'torch_dist'
         assert base_checkpoint_io.parallel_save
         assert base_checkpoint_io.load_directly_on_device == False
+        trainer._teardown()
diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py
@@ -11,17 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import os
 
-os.environ['NVTE_FLASH_ATTN'] = '0'
-os.environ['NVTE_FUSED_ATTN'] = '0'
+import pytest
+
+
+def set_env():
+    os.environ['NVTE_FLASH_ATTN'] = '0'
+    os.environ['NVTE_FUSED_ATTN'] = '0'
+    os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '0'
+
 
 import sys
 from pathlib import Path
 
 import pytest
 import torch
+from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator
 from megatron.core.optimizer import OptimizerConfig
 
 import nemo.lightning as nl
@@ -58,7 +64,7 @@ def load_dcp(ckpt_dir, torch_tensor=True):
     dcp.load(
         state_dict,
         storage_reader=fs_reader,
-        no_dist=True,
+        # no_dist=True,
     )
     return state_dict
 
@@ -84,7 +90,7 @@ def compare_ckpts(a, b, path=[]):
         raise ValueError("Unexpected value type " + str(type(a)))
 
 
-def setup_data_model_optim(log_dir, n_steps, data_path):
+def setup_data_model_optim(log_dir, n_steps, data_path, gbs=2, mbs=1):
     seq_length = 2048
     tokenizer = get_nmt_tokenizer(
         "megatron",
@@ -96,13 +102,20 @@ def setup_data_model_optim(log_dir, n_steps, data_path):
     data = PreTrainingDataModule(
         paths=data_path,
         seq_length=2048,
-        micro_batch_size=1,
-        global_batch_size=2,
+        micro_batch_size=mbs,
+        global_batch_size=gbs,
         seed=1234,
         tokenizer=tokenizer,
         split='9999,1,1',
     )
-
+    # Other tests might have different configs, so need to configure explicitly.
+    reconfigure_num_microbatches_calculator(
+        0,
+        None,
+        gbs,
+        mbs,
+        data_parallel_size=1,
+    )
     gpt_config = llm.GPTConfig(
         num_layers=2,
         hidden_size=128,
@@ -248,6 +261,11 @@ def train(n_steps, resume):
             )
             trainer._teardown()
 
+        set_env()
+        assert os.environ['NVTE_FLASH_ATTN'] == '0'
+        assert os.environ['NVTE_FUSED_ATTN'] == '0'
+        assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '0'
+
         # Train for 40 steps
         train(
             40,
@@ -289,3 +307,4 @@ def train(n_steps, resume):
 
         # Verify ckpt contents
         compare_ckpts(ckpts[0], ckpts[1])
+        teardown()
diff --git a/tests/lightning/test_state_restoration.py b/tests/lightning/test_state_restoration.py
@@ -146,7 +146,7 @@ def run_train_from_scratch():
         data=data,
         trainer=trainer,
         log=NeMoLogger(
-            dir=EXP_DIR,
+            log_dir=EXP_DIR,
         ),
         tokenizer='data',
         optim=opt,
@@ -171,7 +171,7 @@ def run_resume_train():
         data=data,
         trainer=trainer,
         log=NeMoLogger(
-            dir=EXP_DIR,
+            log_dir=EXP_DIR,
         ),
         tokenizer='data',
         optim=opt,