Skip to content

Commit

Permalink
PTQ memory optimization (#11257)
Browse files Browse the repository at this point in the history
* Initial commit

Signed-off-by: Piotr Kaminski <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <[email protected]>

* Add sample generate

Signed-off-by: Piotr Kaminski <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <[email protected]>

* Nemotron quantization, reduce diff

Signed-off-by: Piotr Kaminski <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <[email protected]>

* Reduce diff

Signed-off-by: Piotr Kaminski <[email protected]>

* code review suggestions

Signed-off-by: Piotr Kaminski <[email protected]>

* Bug fixes

Signed-off-by: Piotr Kaminski <[email protected]>

* remove unneeded import

Signed-off-by: Piotr Kaminski <[email protected]>

* fix model type and allow ddp/optim setup

Signed-off-by: Piotr Kaminski <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <[email protected]>

---------

Signed-off-by: Piotr Kaminski <[email protected]>
Signed-off-by: Laplasjan107 <[email protected]>
Signed-off-by: Piotr Kamiński <[email protected]>
Co-authored-by: Piotr Kaminski <[email protected]>
Co-authored-by: Laplasjan107 <[email protected]>
Co-authored-by: Jan Lasek <[email protected]>
  • Loading branch information
4 people authored Nov 18, 2024
1 parent 956b54d commit a4c935f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 20 deletions.
19 changes: 10 additions & 9 deletions nemo/collections/llm/quantization/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,8 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None:
# TODO: Add sample generate
# TODO: Support megatron_amp_O2
export_dir = self.export_config.path
use_nfs_workspace = (model.trainer._fabric.__io__.num_nodes > 1) or (
model.config.pipeline_model_parallel_size > 1
)

use_nfs_workspace = model.config.pipeline_model_parallel_size > 1
export_tensorrt_llm_checkpoint(
model=get_unwrapped_mcore_model(model),
decoder_type=self._get_decoder_type(model.config),
Expand All @@ -284,15 +283,17 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None:
inference_pipeline_parallel=self.export_config.inference_pipeline_parallel,
use_nfs_workspace=use_nfs_workspace,
)
dist.barrier()

# Save the model context in order to restore its tokenizer later. The destination
# path is "nemo_context" as this name is used in nemo.export to setup tokenizer.
shutil.copytree(
os.path.join(model_dir, CONTEXT_PATH),
os.path.join(export_dir, "nemo_context"),
dirs_exist_ok=True,
)
logging.info(f"Model context saved.")
if dist.get_rank() == 0:
shutil.copytree(
os.path.join(model_dir, CONTEXT_PATH),
os.path.join(export_dir, "nemo_context"),
dirs_exist_ok=True,
)
logging.info("Model context saved.")

logging.info(f"Export succeeded, model has been exported to {export_dir}.")

Expand Down
42 changes: 31 additions & 11 deletions nemo/collections/llm/quantization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.llm.inference.base import _setup_trainer_and_restore_model
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.utils import logging

Expand All @@ -42,25 +43,44 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
return model_cfg


def load_with_modelopt_layer_spec(nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1) -> llm.GPTModel:
def load_with_modelopt_layer_spec(
nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1, inference_only: bool = True
):
# TODO: setting ddp="pytorch" with manually deleting model.optim is a hackish way to disable DDP initialization. Needs a systematic solution.
if inference_only:
strategy = nl.MegatronStrategy(
tensor_model_parallel_size=calib_tp,
pipeline_model_parallel_size=calib_pp,
pipeline_dtype=torch.bfloat16,
ckpt_load_optimizer=False,
ckpt_parallel_save_optim=False,
setup_optimizers=False,
lazy_init=True,
ddp="pytorch",
)
else:
strategy = nl.MegatronStrategy(
tensor_model_parallel_size=calib_tp, pipeline_model_parallel_size=calib_pp, pipeline_dtype=torch.bfloat16
)

trainer = nl.Trainer(
devices=calib_tp,
num_nodes=calib_pp,
strategy=nl.MegatronStrategy(
tensor_model_parallel_size=calib_tp, pipeline_model_parallel_size=calib_pp, pipeline_dtype=torch.bfloat16
),
plugins=nl.MegatronMixedPrecision(precision='bf16', pipeline_dtype=torch.bfloat16, autocast_enabled=True),
strategy=strategy,
plugins=nl.MegatronMixedPrecision(precision='bf16', params_dtype=torch.bfloat16, autocast_enabled=True),
)
fabric = trainer.to_fabric()
fabric.launch()

model_path = Path(nemo_checkpoint_path)
model = nl.io.load_context(ckpt_to_context_subdir(model_path)).model
model = nl.io.load_context(path=ckpt_to_context_subdir(model_path), subpath="model")
model.config = quantizable_model_config(model.config)
return fabric.load_model(nemo_checkpoint_path, model=model)

if inference_only:
del model.optim

_setup_trainer_and_restore_model(nemo_checkpoint_path, trainer, model)
return model


def get_unwrapped_mcore_model(model: llm.GPTModel):
def get_unwrapped_mcore_model(model):
from megatron.core.models.gpt import GPTModel as MCoreGPTModel

unwrapped_model = model
Expand Down

0 comments on commit a4c935f

Please sign in to comment.