From f53d1fd28ca43497ae8125a288e069af50973c29 Mon Sep 17 00:00:00 2001
From: Onur Yilmaz
Date: Wed, 15 May 2024 18:07:22 -0400
Subject: [PATCH] Add remove_input_padding and opt_num_tokens export params

---
 nemo/export/tensorrt_llm.py                | 8 +++++---
 nemo/export/trt_llm/tensorrt_llm_build.py  | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index f40db3642d89..d8da88744b0e 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -117,8 +117,8 @@ def export(
         max_batch_size: int = 8,
         max_prompt_embedding_table_size=None,
         use_parallel_embedding: bool = False,
-        enable_context_fmha: bool = True,
         paged_kv_cache: bool = False,
+        remove_input_padding: bool = False,
         dtype: str = "bfloat16",
         load_model: bool = True,
         enable_multi_block_mode: bool = False,
@@ -126,6 +126,7 @@
         lora_target_modules: List[str] = None,
         max_lora_rank: int = 64,
         max_num_tokens: int = None,
+        opt_num_tokens: int = None,
         save_nemo_model_config: bool = False,
     ):
         """
@@ -142,8 +143,7 @@
             max_output_token (int): max output length.
             max_batch_size (int): max batch size.
             max_prompt_embedding_table_size (int): max prompt embedding size.
-            use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend.
-            enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention.
+            use_parallel_embedding (bool): whether to use the parallel embedding feature of TRT-LLM.
             paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
             dtype (str): Floating point type for model weights (Supports BFloat16/Float16).
             load_model (bool): load TensorRT-LLM model after the export.
@@ -239,7 +239,9 @@
             max_prompt_embedding_table_size=max_prompt_embedding_table_size,
             enable_multi_block_mode=enable_multi_block_mode,
             paged_kv_cache=paged_kv_cache,
+            remove_input_padding=remove_input_padding,
             max_num_tokens=max_num_tokens,
+            opt_num_tokens=opt_num_tokens,
         )

         tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index a0026bacd58b..49c2c83ad685 100644
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -372,6 +372,7 @@ def build_and_save_engine(
     max_prompt_embedding_table_size=0,
     enable_multi_block_mode: bool = False,
     paged_kv_cache: bool = False,
+    remove_input_padding: bool = False,
     max_num_tokens: int = None,
     opt_num_tokens: int = None,
     max_beam_width: int = 1,
@@ -402,6 +403,7 @@
         'strongly_typed': False,
         'builder_opt': None,
         'paged_kv_cache': paged_kv_cache,
+        'remove_input_padding': remove_input_padding,
     }

     build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
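
Usage note (not part of this patch): a minimal sketch of driving the new
flags through the exporter. The checkpoint path, model_type value, and
token counts below are illustrative assumptions, not values taken from
this change; the TensorRTLLM(model_dir=...) constructor and the
export(nemo_checkpoint_path=..., model_type=...) entry point are assumed
from the existing NeMo export API.

    from nemo.export import TensorRTLLM

    # Build an engine with input padding removed (packed sequences) and an
    # explicit optimal token count for the TRT-LLM optimization profile.
    exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")
    exporter.export(
        nemo_checkpoint_path="/path/to/model.nemo",  # placeholder path
        model_type="llama",                          # placeholder model type
        max_batch_size=8,
        paged_kv_cache=True,
        remove_input_padding=True,  # new flag added by this patch
        max_num_tokens=4096,        # illustrative value
        opt_num_tokens=2048,        # new flag added by this patch; illustrative value
        dtype="bfloat16",
    )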