Skip to content

Commit

Permalink
remove padding param added
Browse files Browse the repository at this point in the history
  • Loading branch information
oyilmaz-nvidia committed May 15, 2024
1 parent 5d008c7 commit f53d1fd
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
8 changes: 5 additions & 3 deletions nemo/export/tensorrt_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,16 @@ def export(
max_batch_size: int = 8,
max_prompt_embedding_table_size=None,
use_parallel_embedding: bool = False,
enable_context_fmha: bool = True,
paged_kv_cache: bool = False,
remove_input_padding: bool = False,
dtype: str = "bfloat16",
load_model: bool = True,
enable_multi_block_mode: bool = False,
use_lora_plugin: str = None,
lora_target_modules: List[str] = None,
max_lora_rank: int = 64,
max_num_tokens: int = None,
opt_num_tokens: int = None,
save_nemo_model_config: bool = False,
):
"""
Expand All @@ -142,8 +143,7 @@ def export(
max_output_token (int): max output length.
max_batch_size (int): max batch size.
max_prompt_embedding_table_size (int): max prompt embedding size.
use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend.
enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention.
use_parallel_embedding (bool): if True, uses the parallel embedding feature of TensorRT-LLM.
paged_kv_cache (bool): if True, uses the paged KV cache feature of TensorRT-LLM.
dtype (str): Floating point type for model weights (Supports BFloat16/Float16).
load_model (bool): load TensorRT-LLM model after the export.
Expand Down Expand Up @@ -239,7 +239,9 @@ def export(
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
enable_multi_block_mode=enable_multi_block_mode,
paged_kv_cache=paged_kv_cache,
remove_input_padding=remove_input_padding,
max_num_tokens=max_num_tokens,
opt_num_tokens=opt_num_tokens,
)

tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
Expand Down
2 changes: 2 additions & 0 deletions nemo/export/trt_llm/tensorrt_llm_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def build_and_save_engine(
max_prompt_embedding_table_size=0,
enable_multi_block_mode: bool = False,
paged_kv_cache: bool = False,
remove_input_padding: bool = False,
max_num_tokens: int = None,
opt_num_tokens: int = None,
max_beam_width: int = 1,
Expand Down Expand Up @@ -402,6 +403,7 @@ def build_and_save_engine(
'strongly_typed': False,
'builder_opt': None,
'paged_kv_cache': paged_kv_cache,
'remove_input_padding': remove_input_padding,
}
build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)

Expand Down

0 comments on commit f53d1fd

Please sign in to comment.