From f53d1fd28ca43497ae8125a288e069af50973c29 Mon Sep 17 00:00:00 2001
From: Onur Yilmaz
Date: Wed, 15 May 2024 18:07:22 -0400
Subject: [PATCH] Add remove_input_padding and opt_num_tokens export params

---
 nemo/export/tensorrt_llm.py                | 8 +++++---
 nemo/export/trt_llm/tensorrt_llm_build.py  | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index f40db3642d89..d8da88744b0e 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -117,8 +117,8 @@ def export(
         max_batch_size: int = 8,
         max_prompt_embedding_table_size=None,
         use_parallel_embedding: bool = False,
-        enable_context_fmha: bool = True,
         paged_kv_cache: bool = False,
+        remove_input_padding: bool = False,
         dtype: str = "bfloat16",
         load_model: bool = True,
         enable_multi_block_mode: bool = False,
@@ -126,6 +126,7 @@
         lora_target_modules: List[str] = None,
         max_lora_rank: int = 64,
         max_num_tokens: int = None,
+        opt_num_tokens: int = None,
         save_nemo_model_config: bool = False,
     ):
         """
@@ -142,8 +143,7 @@
             max_output_token (int): max output length.
             max_batch_size (int): max batch size.
             max_prompt_embedding_table_size (int): max prompt embedding size.
-            use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend.
-            enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention.
+            use_parallel_embedding (bool): whether to use the parallel embedding feature of TRT-LLM.
             paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
             dtype (str): Floating point type for model weights (Supports BFloat16/Float16).
             load_model (bool): load TensorRT-LLM model after the export.
@@ -239,7 +239,9 @@
             max_prompt_embedding_table_size=max_prompt_embedding_table_size,
             enable_multi_block_mode=enable_multi_block_mode,
             paged_kv_cache=paged_kv_cache,
+            remove_input_padding=remove_input_padding,
             max_num_tokens=max_num_tokens,
+            opt_num_tokens=opt_num_tokens,
         )

         tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index a0026bacd58b..49c2c83ad685 100644
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -372,6 +372,7 @@ def build_and_save_engine(
     max_prompt_embedding_table_size=0,
     enable_multi_block_mode: bool = False,
     paged_kv_cache: bool = False,
+    remove_input_padding: bool = False,
     max_num_tokens: int = None,
     opt_num_tokens: int = None,
     max_beam_width: int = 1,
@@ -402,6 +403,7 @@
         'strongly_typed': False,
         'builder_opt': None,
         'paged_kv_cache': paged_kv_cache,
+        'remove_input_padding': remove_input_padding,
     }

     build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
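
Usage note (not part of this patch): a minimal sketch of driving the new
flags through the exporter. The checkpoint path, model_type value, and
token counts below are illustrative assumptions, not values taken from
this change; the TensorRTLLM(model_dir=...) constructor and the
export(nemo_checkpoint_path=..., model_type=...) entry point are assumed
from the existing NeMo export API.

    from nemo.export import TensorRTLLM

    # Build an engine with input padding removed (packed sequences) and an
    # explicit optimal token count for the TRT-LLM optimization profile.
    exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")
    exporter.export(
        nemo_checkpoint_path="/path/to/model.nemo",  # placeholder path
        model_type="llama",                          # placeholder model type
        max_batch_size=8,
        paged_kv_cache=True,
        remove_input_padding=True,  # new flag added by this patch
        max_num_tokens=4096,        # illustrative value
        opt_num_tokens=2048,        # new flag added by this patch; illustrative value
        dtype="bfloat16",
    )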