diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml
index af561ffe0aad..96752696da41 100644
--- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml
+++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml
@@ -96,6 +96,7 @@ model:
     lora_tuning:
       target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2)
       adapter_dim: 32
+      alpha: ${model.peft.lora_tuning.adapter_dim}
       adapter_dropout: 0.0
       column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
       row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py
index ac85ea7a1d2e..9690d5d21697 100644
--- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py
+++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py
@@ -139,6 +139,7 @@ def __init__(
         input_is_parallel: bool = False,  # NOTE: (@ertkonuk) we need this for LoRA adapters that are applied to RowParallelLinear layers
         dropout: float = 0.0,
         model_parallel_config: Optional[ModelParallelConfig] = None,
+        alpha: float | None = None,
         **kwargs,
     ):
         super().__init__()
@@ -151,7 +152,9 @@ def __init__(
         self.activation = activation_registry[activation]()
         self.norm_position = norm_position
         self.dim = dim
+        self.alpha = alpha if alpha is not None else self.dim
         self.input_is_parallel = input_is_parallel
+
         # megatron_gpt_peft_models will provide this arg, but deprecated ones do not.
         # in case this arg is not provided, use the dummy default config.
         if model_parallel_config is None:
@@ -274,6 +277,8 @@ def forward(self, x):
         if self.dropout is not None:
             x = self.dropout(x)
 
+        x = x * (self.alpha / self.dim)
+
         return x
 
 
@@ -290,6 +295,7 @@ class ParallelLinearAdapterConfig(AdapterConfig):
     gather_output: bool = True
     input_is_parallel: bool = False
     dropout: float = 0.0
+    alpha: float | None = None
     network_alpha: int | None = None
     _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__)
 
diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py
index 815ad4d9e952..97305991d0b3 100644
--- a/nemo/collections/nlp/parts/peft_config.py
+++ b/nemo/collections/nlp/parts/peft_config.py
@@ -182,6 +182,7 @@ def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_
             "row_init_method": lora_cfg.get("row_init_method", "zero"),
             "gather_output": False,
             "dropout": lora_cfg.adapter_dropout,
+            "alpha": lora_cfg.get("alpha", lora_cfg.adapter_dim),
         }
 
         if lora_cfg.weight_tying:
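
The diff wires a LoRA `alpha` hyperparameter from the YAML config through `_create_lora_config` into `ParallelLinearAdapter`, which now scales its output by `alpha / dim`, where `dim` is the adapter rank (`adapter_dim`). Since `alpha` defaults to `adapter_dim` at every level, the scale factor is 1.0 unless a user overrides it, so existing configs and checkpoints behave as before. Below is a minimal, self-contained sketch of the scaling behaviour; the `ToyLoRAAdapter` class and its plain `nn.Linear` layers are illustrative stand-ins, not NeMo's tensor-parallel implementation.

import torch
import torch.nn as nn


class ToyLoRAAdapter(nn.Module):
    """Illustrative LoRA adapter showing the alpha / dim scaling added in this diff.

    `dim` is the adapter rank (adapter_dim) and `alpha` falls back to `dim`,
    mirroring `alpha if alpha is not None else self.dim` in ParallelLinearAdapter.
    """

    def __init__(self, in_features: int, out_features: int, dim: int = 32, alpha: float | None = None):
        super().__init__()
        self.dim = dim
        self.alpha = alpha if alpha is not None else dim
        # Plain nn.Linear stand-ins; NeMo uses tensor-parallel column/row linears here.
        self.linear_in = nn.Linear(in_features, dim, bias=False)
        self.linear_out = nn.Linear(dim, out_features, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear_out(self.linear_in(x))
        # Same scaling as the new line in ParallelLinearAdapter.forward().
        return x * (self.alpha / self.dim)


if __name__ == "__main__":
    # With alpha == dim the factor is 1.0 (the default); alpha=16.0 and dim=8 scale the output by 2.0.
    adapter = ToyLoRAAdapter(in_features=16, out_features=16, dim=8, alpha=16.0)
    print(adapter(torch.randn(4, 16)).shape)  # torch.Size([4, 16])

Exposing `alpha` separately from the rank lets users amplify or dampen the adapter's contribution without changing the number of trainable parameters, which is the usual reason LoRA implementations treat it as an independent hyperparameter.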