remove_input_padding param added

NVIDIA · May 15, 2024 · f53d1fd · f53d1fd
1 parent 5d008c7
commit f53d1fd
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 3 deletions.
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
@@ -117,15 +117,16 @@ def export(
         max_batch_size: int = 8,
         max_prompt_embedding_table_size=None,
         use_parallel_embedding: bool = False,
-        enable_context_fmha: bool = True,
         paged_kv_cache: bool = False,
+        remove_input_padding: bool = False,
         dtype: str = "bfloat16",
         load_model: bool = True,
         enable_multi_block_mode: bool = False,
         use_lora_plugin: str = None,
         lora_target_modules: List[str] = None,
         max_lora_rank: int = 64,
         max_num_tokens: int = None,
+        opt_num_tokens: int = None,
         save_nemo_model_config: bool = False,
     ):
         """
@@ -142,8 +143,7 @@ def export(
             max_output_token (int): max output length.
             max_batch_size (int): max batch size.
             max_prompt_embedding_table_size (int): max prompt embedding size.
-            use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend.
-            enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention.
+            use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not
             paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
             dtype (str): Floating point type for model weights (Supports BFloat16/Float16).
             load_model (bool): load TensorRT-LLM model after the export.
@@ -239,7 +239,9 @@ def export(
                         max_prompt_embedding_table_size=max_prompt_embedding_table_size,
                         enable_multi_block_mode=enable_multi_block_mode,
                         paged_kv_cache=paged_kv_cache,
+                        remove_input_padding=remove_input_padding,
                         max_num_tokens=max_num_tokens,
+                        opt_num_tokens=opt_num_tokens,
                     )
 
             tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")

diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -372,6 +372,7 @@ def build_and_save_engine(
     max_prompt_embedding_table_size=0,
     enable_multi_block_mode: bool = False,
     paged_kv_cache: bool = False,
+    remove_input_padding: bool = False,
     max_num_tokens: int = None,
     opt_num_tokens: int = None,
     max_beam_width: int = 1,
@@ -402,6 +403,7 @@ def build_and_save_engine(
         'strongly_typed': False,
         'builder_opt': None,
         'paged_kv_cache': paged_kv_cache,
+        'remove_input_padding': remove_input_padding,
     }
     build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)