From 1fe50da0aa94f8b4b07566e6dbc5d98fca881724 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 30 May 2024 07:25:12 -0700
Subject: [PATCH] Fix P-tuning for Llama based models (#9300)

* Fix P-tuning for Llama based models (#9297)

* Added the BOS token for Llama, Mistral and Mixtral.

Signed-off-by: Alexey Panteleev

* Don't load an existing TRT-LLM model before export to speed up the export process and avoid possible contamination from previous runs.

Signed-off-by: Alexey Panteleev

* Apply isort and black reformatting

Signed-off-by: apanteleev

---------

Signed-off-by: Alexey Panteleev
Signed-off-by: apanteleev
Co-authored-by: apanteleev
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>

* Fix the export test

---------

Signed-off-by: Alexey Panteleev
Signed-off-by: apanteleev
Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: Alexey Panteleev
Co-authored-by: apanteleev
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Signed-off-by: Jan Lasek
---
 nemo/export/trt_llm/tensorrt_llm_run.py | 8 +++++++-
 scripts/deploy/nlp/deploy_triton.py     | 1 +
 scripts/export/export_to_trt_llm.py     | 2 +-
 tests/export/test_nemo_export.py        | 2 +-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index fe0189b106286..1bdfd5237caf1 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -312,7 +312,13 @@ def load(
     max_batch_size = config["build_config"]["max_batch_size"]
     max_input_len = config["build_config"]["max_input_len"]
 
-    add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False
+    architectures_that_need_bos_token = [
+        "GemmaForCausalLM",
+        "LLaMAForCausalLM",
+        "MistralForCausalLM",
+        "MixtralForCausalLM",
+    ]
+    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
 
     return TensorrtLLMHostContext(
         executor=executor,
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
index 0a9604a73cdc2..5a2440b0fa2f7 100755
--- a/scripts/deploy/nlp/deploy_triton.py
+++ b/scripts/deploy/nlp/deploy_triton.py
@@ -216,6 +216,7 @@ def nemo_deploy(argv):
         trt_llm_exporter = TensorRTLLM(
             model_dir=trt_llm_path,
             lora_ckpt_list=args.lora_ckpt,
+            load_model=(args.nemo_checkpoint is None),
             use_python_runtime=(not args.use_cpp_runtime),
         )
 
diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py
index ce9ef6a1e1328..a9c16bf8cff65 100644
--- a/scripts/export/export_to_trt_llm.py
+++ b/scripts/export/export_to_trt_llm.py
@@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv):
         return
 
     try:
-        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository)
+        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False)
 
         LOGGER.info("Export to TensorRT-LLM function is called.")
         trt_llm_exporter.export(
diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py
index b3e186433561e..97a06a1f68877 100644
--- a/tests/export/test_nemo_export.py
+++ b/tests/export/test_nemo_export.py
@@ -200,7 +200,7 @@ def run_trt_llm_inference(
         print("---- LoRA could not be enabled and skipping the test.")
         return None, None, None, None, None
 
-    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list)
+    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False)
 
     trt_llm_exporter.export(
         nemo_checkpoint_path=checkpoint_path,
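For reference, a minimal standalone sketch of the BOS-token selection introduced by the tensorrt_llm_run.py hunk. The `config` dict here is a hypothetical stand-in for the parsed engine config; the membership test itself mirrors the patch, which generalizes the previous Gemma-only special case to the Llama, Mistral, and Mixtral architectures.

    # Sketch of the generalized add_bos check; `config` is a hypothetical
    # stand-in for the engine config that tensorrt_llm_run.py parses.
    architectures_that_need_bos_token = [
        "GemmaForCausalLM",
        "LLaMAForCausalLM",
        "MistralForCausalLM",
        "MixtralForCausalLM",
    ]

    config = {"pretrained_config": {"architecture": "LLaMAForCausalLM"}}
    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
    print(add_bos)  # True: Llama prompts now receive the BOS token that p-tuning expects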
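The `load_model` flag threaded through the remaining hunks keeps the exporter from loading a pre-existing engine out of `model_dir` before a fresh export, which speeds the export up and avoids contamination from previous runs. Below is a minimal sketch of the resulting export flow, assuming the TensorRTLLM API as used in scripts/export/export_to_trt_llm.py; the checkpoint and engine paths are hypothetical placeholders.

    # Sketch of a clean export, mirroring scripts/export/export_to_trt_llm.py;
    # both paths below are hypothetical placeholders.
    from nemo.export import TensorRTLLM

    # load_model=False skips loading any engine already present in model_dir,
    # so the export starts from a clean state.
    exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine", load_model=False)
    exporter.export(
        nemo_checkpoint_path="/models/llama-7b.nemo",
        model_type="llama",
    )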