From 1fe50da0aa94f8b4b07566e6dbc5d98fca881724 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 30 May 2024 07:25:12 -0700
Subject: [PATCH] Fix P-tuning for Llama based models (#9300)

* Fix P-tuning for Llama based models (#9297)

* Added the BOS token for Llama, Mistral and Mixtral.

Signed-off-by: Alexey Panteleev

* Don't load an existing TRT-LLM model before export to speed up the export process and avoid possible contamination from previous runs.

Signed-off-by: Alexey Panteleev

* Apply isort and black reformatting

Signed-off-by: apanteleev

---------

Signed-off-by: Alexey Panteleev
Signed-off-by: apanteleev
Co-authored-by: apanteleev
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>

* Fix the export test

---------

Signed-off-by: Alexey Panteleev
Signed-off-by: apanteleev
Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: Alexey Panteleev
Co-authored-by: apanteleev
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Signed-off-by: Jan Lasek
---
 nemo/export/trt_llm/tensorrt_llm_run.py | 8 +++++++-
 scripts/deploy/nlp/deploy_triton.py     | 1 +
 scripts/export/export_to_trt_llm.py     | 2 +-
 tests/export/test_nemo_export.py        | 2 +-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index fe0189b106286..1bdfd5237caf1 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -312,7 +312,13 @@ def load(
     max_batch_size = config["build_config"]["max_batch_size"]
     max_input_len = config["build_config"]["max_input_len"]
 
-    add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False
+    architectures_that_need_bos_token = [
+        "GemmaForCausalLM",
+        "LLaMAForCausalLM",
+        "MistralForCausalLM",
+        "MixtralForCausalLM",
+    ]
+    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
 
     return TensorrtLLMHostContext(
         executor=executor,
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
index 0a9604a73cdc2..5a2440b0fa2f7 100755
--- a/scripts/deploy/nlp/deploy_triton.py
+++ b/scripts/deploy/nlp/deploy_triton.py
@@ -216,6 +216,7 @@ def nemo_deploy(argv):
         trt_llm_exporter = TensorRTLLM(
             model_dir=trt_llm_path,
             lora_ckpt_list=args.lora_ckpt,
+            load_model=(args.nemo_checkpoint is None),
             use_python_runtime=(not args.use_cpp_runtime),
         )
 
diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py
index ce9ef6a1e1328..a9c16bf8cff65 100644
--- a/scripts/export/export_to_trt_llm.py
+++ b/scripts/export/export_to_trt_llm.py
@@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv):
         return
 
     try:
-        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository)
+        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False)
 
         LOGGER.info("Export to TensorRT-LLM function is called.")
         trt_llm_exporter.export(
diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py
index b3e186433561e..97a06a1f68877 100644
--- a/tests/export/test_nemo_export.py
+++ b/tests/export/test_nemo_export.py
@@ -200,7 +200,7 @@ def run_trt_llm_inference(
         print("---- LoRA could not be enabled and skipping the test.")
         return None, None, None, None, None
 
-    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list)
+    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False)
 
     trt_llm_exporter.export(
         nemo_checkpoint_path=checkpoint_path,
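For reference, a minimal standalone sketch of the BOS-token selection introduced by the tensorrt_llm_run.py hunk. The `config` dict here is a hypothetical stand-in for the parsed engine config; the membership test itself mirrors the patch, which generalizes the previous Gemma-only special case to the Llama, Mistral, and Mixtral architectures.

    # Sketch of the generalized add_bos check; `config` is a hypothetical
    # stand-in for the engine config that tensorrt_llm_run.py parses.
    architectures_that_need_bos_token = [
        "GemmaForCausalLM",
        "LLaMAForCausalLM",
        "MistralForCausalLM",
        "MixtralForCausalLM",
    ]

    config = {"pretrained_config": {"architecture": "LLaMAForCausalLM"}}
    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
    print(add_bos)  # True: Llama prompts now receive the BOS token that p-tuning expects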
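The `load_model` flag threaded through the remaining hunks keeps the exporter from loading a pre-existing engine out of `model_dir` before a fresh export, which speeds the export up and avoids contamination from previous runs. Below is a minimal sketch of the resulting export flow, assuming the TensorRTLLM API as used in scripts/export/export_to_trt_llm.py; the checkpoint and engine paths are hypothetical placeholders.

    # Sketch of a clean export, mirroring scripts/export/export_to_trt_llm.py;
    # both paths below are hypothetical placeholders.
    from nemo.export import TensorRTLLM

    # load_model=False skips loading any engine already present in model_dir,
    # so the export starts from a clean state.
    exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine", load_model=False)
    exporter.export(
        nemo_checkpoint_path="/models/llama-7b.nemo",
        model_type="llama",
    )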