fix lora and ptuning and isort/black (#9290) (#9295)
* fix lora and ptuning and isort/black

* remove raise error when multiple config files

* Apply isort and black reformatting

* fix script issues
---------

Signed-off-by: Onur Yilmaz <[email protected]>
Signed-off-by: oyilmaz-nvidia <[email protected]>
Co-authored-by: Onur Yilmaz <[email protected]>
Co-authored-by: oyilmaz-nvidia <[email protected]>
Signed-off-by: Jan Lasek <[email protected]>
3 people authored and janekl committed Jun 12, 2024
1 parent 9e8bc70 commit 4abcbe1
Showing 15 changed files with 252 additions and 102 deletions.
4 changes: 0 additions & 4 deletions nemo/deploy/deploy_pytriton.py
@@ -24,7 +24,6 @@


class DeployPyTriton(DeployBase):

"""
Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy.
@@ -102,7 +101,6 @@ def __init__(
)

def deploy(self):

"""
Deploys any models to Triton Inference Server.
"""
@@ -148,7 +146,6 @@ def deploy(self):
print(e)

def serve(self):

"""
Starts serving the model and waits for the requests
"""
@@ -163,7 +160,6 @@ def serve(self):
print(e)

def run(self):

"""
Starts serving the model asynchronously.
"""
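Note: the hunks above only drop stray blank lines after the method signatures, but for orientation here is a minimal usage sketch of `DeployPyTriton`. The constructor arguments (`model`, `triton_model_name`, `port`) are assumptions not shown in this diff; only the `deploy()`, `serve()`, and `run()` methods are visible above.

```python
# Illustrative sketch only; constructor argument names are assumed, not confirmed by this diff.
from nemo.deploy.deploy_pytriton import DeployPyTriton


def serve_on_triton(exporter, name: str = "my_llm", port: int = 8000) -> None:
    """`exporter` is any object implementing ITritonDeployable (e.g. a TensorRT-LLM exporter)."""
    nm = DeployPyTriton(model=exporter, triton_model_name=name, port=port)
    nm.deploy()  # register the model with Triton Inference Server
    nm.serve()   # block and wait for requests; nm.run() would start serving asynchronously
```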
3 changes: 2 additions & 1 deletion nemo/deploy/nlp/query_llm.py
@@ -71,7 +71,8 @@ class NemoQueryLLM(NemoQueryLLMBase):

def __init__(self, url, model_name):
super().__init__(
url=url, model_name=model_name,
url=url,
model_name=model_name,
)

def query_llm(
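A minimal query sketch for the class reformatted above. Only the constructor signature (`url`, `model_name`) and the `query_llm` method name appear in this hunk; the keyword arguments `prompts` and `max_output_len` are assumptions.

```python
# Illustrative sketch only; the query_llm keyword arguments are assumed.
from nemo.deploy.nlp.query_llm import NemoQueryLLM

nq = NemoQueryLLM(url="localhost:8000", model_name="my_llm")
output = nq.query_llm(prompts=["What color is a banana?"], max_output_len=32)
print(output)
```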
18 changes: 15 additions & 3 deletions nemo/export/tensorrt_llm.py
@@ -84,15 +84,24 @@ class TensorRTLLM(ITritonDeployable):
"""

def __init__(self, model_dir: str, lora_ckpt_list: List[str] = None, load_model: bool = True):
def __init__(
self,
model_dir: str,
lora_ckpt_list: List[str] = None,
load_model: bool = True,
use_python_runtime: bool = True,
):
"""
Args:
model_dir (str): path for storing the TensorRT-LLM model files.
lora_ckpt_list (List[str]): lora checkpoint paths.
load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir.
use_python_runtime (bool): whether to use python or c++ runtime.
"""

self.model_dir = model_dir
self.lora_ckpt_list = lora_ckpt_list
self.use_python_runtime = use_python_runtime
self.model = None
self.tokenizer = None
self.n_gpus = None
@@ -645,7 +654,7 @@ def _prep_ptuning_table(self):
if len(vtokens_embeddings) > 0:
self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size)

max_prompt_embedding_table_size = self.config['builder_config']['max_prompt_embedding_table_size']
max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size']
actual_prompt_table_size = self.p_table.shape[0]

if actual_prompt_table_size > max_prompt_embedding_table_size:
@@ -776,7 +785,10 @@ def _load(self):
self._load_config_file()
self.tokenizer = get_tokenzier(Path(os.path.join(self.model_dir)))
self.model = load(
tokenizer=self.tokenizer, engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list
tokenizer=self.tokenizer,
engine_dir=self.model_dir,
lora_ckpt_list=self.lora_ckpt_list,
use_python_runtime=self.use_python_runtime,
)
self._load_prompt_tables()
except Exception as error:
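The key functional changes in this file are the new `use_python_runtime` constructor flag (forwarded to `load()`) and reading `max_prompt_embedding_table_size` from the `'build_config'` section of the engine config instead of `'builder_config'`. A minimal construction sketch, with the paths below purely illustrative:

```python
# Constructor arguments as documented in the diff above; the paths are placeholders.
from nemo.export.tensorrt_llm import TensorRTLLM

trt_llm_exporter = TensorRTLLM(
    model_dir="/tmp/trt_llm_model_dir",      # where engine files are stored or loaded from
    lora_ckpt_list=["/tmp/lora_ckpt.nemo"],  # optional LoRA checkpoint paths
    load_model=True,                         # load the TensorRT-LLM engine if it already exists
    use_python_runtime=True,                 # new flag: Python runtime (True) or C++ runtime (False)
)
```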
6 changes: 5 additions & 1 deletion nemo/export/trt_llm/decoder/decoder.py
@@ -90,7 +90,11 @@ def build_post_layernorm(self, layer) -> Optional[LayernormConfig]:
pass

def __init__(
self, decoder_type: str, dtype: trt.DataType = trt.float16, rank: int = 0, tensor_parallel: int = 1,
self,
decoder_type: str,
dtype: trt.DataType = trt.float16,
rank: int = 0,
tensor_parallel: int = 1,
):
"""Initializes the DecoderLayerConfigBuilder."""
self.decoder_type = decoder_type
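The concrete builders reformatted below (falcon, gemma, gpt, gptj, llama) all follow the same pattern as this base class: map Hugging Face submodules to TensorRT-LLM linear configs via `LinearConfig.from_nn_module`. A toy sketch of that pattern follows; the subclass name, the layer attribute names, and the `model_config` import path are assumptions, not NeMo code.

```python
# Pattern sketch only; not part of this commit.
from nemo.export.trt_llm.decoder.decoder import DecoderLayerConfigBuilder
from nemo.export.trt_llm.model_config import (  # import path assumed from the package layout
    LINEAR_COLUMN,
    LINEAR_ROW,
    LinearConfig,
    MLPConfig,
)


class ToyDecoderLayerConfigBuilder(DecoderLayerConfigBuilder):
    """Hypothetical builder showing how build_mlp maps HF modules to TRT-LLM configs."""

    def build_mlp(self, layer) -> MLPConfig:
        config = MLPConfig()
        config.fc = LinearConfig.from_nn_module(
            layer.mlp.up_proj,  # attribute name varies per model family
            LINEAR_COLUMN,
            rank=self.rank,
            tensor_parallel=self.tensor_parallel,
            dtype=self.dtype,
        )
        config.proj = LinearConfig.from_nn_module(
            layer.mlp.down_proj,
            LINEAR_ROW,
            rank=self.rank,
            tensor_parallel=self.tensor_parallel,
            dtype=self.dtype,
        )
        return config
```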
29 changes: 24 additions & 5 deletions nemo/export/trt_llm/decoder/falcon.py
@@ -69,7 +69,11 @@ def build_attention(self, layer) -> AttentionConfig:
)

config.dense = LinearConfig.from_nn_module(
layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.self_attn.o_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -78,13 +82,25 @@ def build_attention(self, layer) -> AttentionConfig:
def build_mlp(self, layer) -> MLPConfig:
config = MLPConfig()
config.fc = LinearConfig.from_nn_module(
layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.gate_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.proj = LinearConfig.from_nn_module(
layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.down_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.gate = LinearConfig.from_nn_module(
layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.up_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -130,4 +146,7 @@ def build_decoder(self, layer):
config.set_if_not_exist('bias', False)
config.set_if_not_exist('moe_num_experts', 0)

return FalconDecoderLayer(config=config, layer_idx=self.layer_id,)
return FalconDecoderLayer(
config=config,
layer_idx=self.layer_id,
)
29 changes: 24 additions & 5 deletions nemo/export/trt_llm/decoder/gemma.py
@@ -64,7 +64,11 @@ def build_attention(self, layer) -> AttentionConfig:
)

config.dense = LinearConfig.from_nn_module(
layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.self_attn.o_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -73,13 +77,25 @@ def build_attention(self, layer) -> AttentionConfig:
def build_mlp(self, layer) -> MLPConfig:
config = MLPConfig()
config.fc = LinearConfig.from_nn_module(
layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.gate_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.proj = LinearConfig.from_nn_module(
layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.down_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.gate = LinearConfig.from_nn_module(
layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.up_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -128,4 +144,7 @@ def build_decoder(self, layer):
config.set_if_not_exist('dense_context_fmha', False)
config.set_if_not_exist('moe_num_experts', 0)

return GemmaDecoderLayer(config=config, layer_idx=self.layer_id,)
return GemmaDecoderLayer(
config=config,
layer_idx=self.layer_id,
)
28 changes: 23 additions & 5 deletions nemo/export/trt_llm/decoder/gpt.py
@@ -54,11 +54,18 @@ def build_input_layernorm(self, layer) -> LayernormConfig:
def build_attention(self, layer) -> AttentionConfig:
config = AttentionConfig()
config.qkv = LinearConfig.from_qkv_nn_modules(
[layer.attn.c_attn], rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
[layer.attn.c_attn],
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

config.dense = LinearConfig.from_nn_module(
layer.attn.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.attn.c_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -67,10 +74,18 @@ def build_attention(self, layer) -> AttentionConfig:
def build_mlp(self, layer) -> MLPConfig:
config = MLPConfig()
config.fc = LinearConfig.from_nn_module(
layer.mlp.c_fc, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.c_fc,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.proj = LinearConfig.from_nn_module(
layer.mlp.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.c_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -126,4 +141,7 @@ def build_decoder(self, layer):
config.set_if_not_exist('rotary_pct', rotary_pct)
config.set_if_not_exist('moe_num_experts', 0)

return GPTDecoderLayer(config=config, layer_idx=self.layer_id,)
return GPTDecoderLayer(
config=config,
layer_idx=self.layer_id,
)
18 changes: 15 additions & 3 deletions nemo/export/trt_llm/decoder/gptj.py
@@ -60,7 +60,11 @@ def build_attention(self, layer) -> AttentionConfig:
)

config.dense = LinearConfig.from_nn_module(
layer.attn.out_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.attn.out_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

config.rotary_dim = layer.attn.rotary_dim
@@ -71,10 +75,18 @@ def build_attention(self, layer) -> AttentionConfig:
def build_mlp(self, layer) -> MLPConfig:
config = MLPConfig()
config.fc = LinearConfig.from_nn_module(
layer.mlp.fc_in, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.fc_in,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.proj = LinearConfig.from_nn_module(
layer.mlp.fc_out, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.fc_out,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
29 changes: 24 additions & 5 deletions nemo/export/trt_llm/decoder/llama.py
@@ -66,7 +66,11 @@ def build_attention(self, layer) -> AttentionConfig:
)

config.dense = LinearConfig.from_nn_module(
layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.self_attn.o_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -75,13 +79,25 @@ def build_attention(self, layer) -> AttentionConfig:
def build_mlp(self, layer) -> MLPConfig:
config = MLPConfig()
config.fc = LinearConfig.from_nn_module(
layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.gate_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.proj = LinearConfig.from_nn_module(
layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.down_proj,
LINEAR_ROW,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)
config.gate = LinearConfig.from_nn_module(
layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype,
layer.mlp.up_proj,
LINEAR_COLUMN,
rank=self.rank,
tensor_parallel=self.tensor_parallel,
dtype=self.dtype,
)

return config
@@ -147,4 +163,7 @@ def build_decoder(self, layer):
config.moe_tp_mode = layer.moe_tp_mode
config.moe_normalization_mode = layer.moe_renorm_mode

return LLaMADecoderLayer(config=config, layer_idx=self.layer_id,)
return LLaMADecoderLayer(
config=config,
layer_idx=self.layer_id,
)