Adding LoKrModel Class to paddle.peft library (PaddlePaddle#9269)
* passing pre-commit

* removing tp and pp logic for single gpu training

* add disable_lokr attribute in lokr_layer

* refine comments

* add lokr tests and modified layer bug

* add lokrtests

* add lokrtests

* add lokr_argument.json

* add integration test, fix bugs based on tests.

* refactor lora_dim to lokr_dim

* no inference

* add more tests

* resolve merge conflict

* add more randtests

* pass isort check(maybe)
WhuanY authored Nov 27, 2024
1 parent 8d8a42c commit 3ef14dc
Showing 15 changed files with 1,297 additions and 2 deletions.
34 changes: 34 additions & 0 deletions llm/config/llama/lokr_argument.json
@@ -0,0 +1,34 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/lokr_ckpts",
"lokr": true,
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"num_train_epochs": 1,
"learning_rate": 2e-05,
"lr_scheduler_type": "linear",
"attention_probs_dropout_prob": 0,
"hidden_dropout_prob": 0,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "no",
"save_strategy": "steps",
"save_steps": 500,
"src_length": 512,
"max_length": 512,
"bf16": true,
"do_train": true,
"do_eval": false,
"disable_tqdm": false,
"load_best_model_at_end": false,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": false,
"save_total_limit": 100,
"fp16_opt_level": "O2",
"sharding": "stage2",
"zero_padding": false,
"use_flash_attention": false,
"unified_checkpoint": true
}
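
The JSON above only toggles lokr and relies on defaults for the LoKr-specific knobs. Below is a minimal sketch, not part of this commit, of how those settings end up as a LoKrConfig once run_finetune.py consumes the file; the config path and the dtype mapping are illustrative assumptions.

# Sketch only: run_finetune.py additionally derives target_modules via get_lora_target_modules.
import json

from paddlenlp.peft import LoKrConfig

with open("llm/config/llama/lokr_argument.json") as f:
    cfg = json.load(f)

if cfg.get("lokr"):
    lokr_config = LoKrConfig(
        lokr_dim=8,  # not set in the JSON, so the ModelArgument default of 8 applies
        dtype="bfloat16" if cfg.get("bf16") else "float16",  # illustrative dtype mapping
        base_model_name_or_path=cfg["model_name_or_path"],
    )
    print(lokr_config.to_dict())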
2 changes: 2 additions & 0 deletions llm/docs/finetune.md
@@ -130,6 +130,8 @@ python merge_lora_params.py \
- `neftune_noise_alpha`: NEFT alpha parameter, defaults to 5.0.
- `vera`: Whether to enable the VeRA fine-tuning strategy, defaults to False.
- `vera_rank`: The rank used by the VeRA algorithm, defaults to 8.
- `lokr`: Whether to enable the LoKr fine-tuning strategy, defaults to False.
- `lokr_dim`: The rank (dimension) used by the LoKr algorithm for the adapter matrix, defaults to 8.
- `use_long_sequence_strategies`: Whether to use long-sequence extension strategies, defaults to False.
- `strategy_type`: The type of long-sequence extension strategy, defaults to None.
- `strategy_name`: The specific name of the long-sequence extension strategy, defaults to None.
17 changes: 17 additions & 0 deletions llm/run_finetune.py
@@ -36,6 +36,8 @@
)
from paddlenlp.metrics import BLEU, Rouge1, Rouge2, RougeL
from paddlenlp.peft import (
    LoKrConfig,
    LoKrModel,
    LoRAConfig,
    LoRAModel,
    PrefixConfig,
@@ -451,6 +453,21 @@ def neft_post_hook(module, input, output):

        model.print_trainable_parameters()

    if model_args.lokr:
        if model_args.lokr_path is None:
            target_modules = get_lora_target_modules(model)
            lokr_config = LoKrConfig(
                target_modules=target_modules,
                lokr_dim=model_args.lokr_dim,
                dtype=dtype,
                base_model_name_or_path=model_args.model_name_or_path,
            )
            model = LoKrModel(model, lokr_config)
        else:
            model = LoKrModel.from_pretrained(model=model, lokr_path=model_args.lokr_path)

    # For debugging purposes, you can print the model to see which layers were transformed into LoKr layers
    # print(model)
    if model_args.reft:
        intervention_dtype = dtype
        intervention_params = {
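For reference, the same wiring can be exercised outside the Trainer script. The sketch below is an assumption-laden example, not part of this commit: the Llama checkpoint name and the q_proj/v_proj regexes are illustrative only.

from paddlenlp.peft import LoKrConfig, LoKrModel
from paddlenlp.transformers import AutoModelForCausalLM

# Illustrative checkpoint and target-module regexes; substitute your own.
base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", dtype="bfloat16")
lokr_config = LoKrConfig(
    target_modules=[".*q_proj.*", ".*v_proj.*"],
    lokr_dim=8,
    lokr_alpha=8.0,
    dtype="bfloat16",
    base_model_name_or_path="meta-llama/Meta-Llama-3-8B",
)
model = LoKrModel(base, lokr_config)  # matched Linear layers are replaced with LoKrLinear
# print(model)  # inspect which layers were replaced

# Resuming from previously saved LoKr weights instead:
# model = LoKrModel.from_pretrained(model=base, lokr_path="./checkpoints/lokr_ckpts")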
116 changes: 116 additions & 0 deletions llm/tools/merge_lokr_params.py
@@ -0,0 +1,116 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

import paddle

from paddlenlp.peft import LoKrConfig, LoKrModel
from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from paddlenlp.utils.env import CONFIG_NAME


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", default=None, help="The directory of the pretrained model.")
    parser.add_argument("--lokr_path", default="", help="The directory of the LoKr parameters.")
    parser.add_argument(
        "--merge_lokr_model_path",
        default="",
        help="The directory to write the merged parameters to.",
    )
    parser.add_argument("--device", type=str, default="gpu", help="Device")
    parser.add_argument(
        "--low_gpu_mem", type=bool, default=True, help="Whether to use low GPU memory. Defaults to True."
    )
    return parser.parse_args()


def weight_process(name, lokr_config, state_dict):
    weight = state_dict.pop(name + ".weight")
    use_w1 = (name + ".lokr_w1") in state_dict
    use_w2 = (name + ".lokr_w2") in state_dict
    if use_w1:
        lokr_w1 = state_dict.pop(name + ".lokr_w1")
    else:
        lokr_w1_a = state_dict.pop(name + ".lokr_w1_a")
        lokr_w1_b = state_dict.pop(name + ".lokr_w1_b")
    if use_w2:
        lokr_w2 = state_dict.pop(name + ".lokr_w2")
    else:
        lokr_w2_a = state_dict.pop(name + ".lokr_w2_a")
        lokr_w2_b = state_dict.pop(name + ".lokr_w2_b")

    scaling = lokr_config.lokr_alpha / lokr_config.lokr_dim

    # Merged weight: W + scaling * kron(W1, W2)^T, where each factor is either stored
    # in full or as a low-rank product (w*_a @ w*_b).
    adapter_weight = (
        scaling
        * paddle.kron(lokr_w1 if use_w1 else lokr_w1_a @ lokr_w1_b, lokr_w2 if use_w2 else lokr_w2_a @ lokr_w2_b).T
    )
    state_dict[name + ".weight"] = weight + adapter_weight


def merge():
    args = parse_arguments()
    paddle.set_device(args.device)

    lokr_config = LoKrConfig.from_pretrained(args.lokr_path)
    if lokr_config.base_model_name_or_path is None:
        if args.model_name_or_path is None:
            raise ValueError("We can not find a valid model_name_or_path.")
        else:
            lokr_config.base_model_name_or_path = args.model_name_or_path

    if os.path.isfile(os.path.join(args.lokr_path, CONFIG_NAME)):
        config = AutoConfig.from_pretrained(args.lokr_path)
    elif args.model_name_or_path is not None:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        raise ValueError(
            f"We can not find config.json in lokr_path: {args.lokr_path} or find a valid model_name_or_path."
        )
    config.dtype = lokr_config.dtype
    if (
        lokr_config.dtype == "bfloat16" or config.quantization_config.weight_quantize_algo in ["nf4", "fp4"]
    ) and args.device == "cpu":
        raise ValueError("We can not apply bfloat16 or nf4/fp4 lokr merge on cpu.")

    # Using device_guard() here would cause the SVD decomposition to fail
    model = AutoModelForCausalLM.from_pretrained(
        lokr_config.base_model_name_or_path,
        config=config,
        low_cpu_mem_usage=True,
    )
    model = LoKrModel.from_pretrained(model=model, lokr_path=args.lokr_path, lokr_config=lokr_config)

    model.eval()
    model_state_dict = model.model.state_dict()
    lokr_name_list = []

    for key in model_state_dict.keys():
        if "lokr" in key:
            lokr_name_list.append(key.split(".lokr")[0])

    lokr_name_list = list(set(lokr_name_list))
    for name in lokr_name_list:
        weight_process(name, lokr_config, model_state_dict)

    model.model.save_pretrained(args.merge_lokr_model_path, state_dict=model_state_dict)
    tokenizer = AutoTokenizer.from_pretrained(lokr_config.base_model_name_or_path)
    tokenizer.save_pretrained(args.merge_lokr_model_path)


if __name__ == "__main__":
    merge()
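
A quick shape check of the merge rule used in weight_process, i.e. W_new = W + (lokr_alpha / lokr_dim) * kron(W1, W2)^T. The sizes below are illustrative and not taken from this commit.

import paddle

# Illustrative factors for a Linear with in_features=12, out_features=8,
# factored as out = 2 * 4 and in = 3 * 4, with the adapter factor at rank lokr_dim=4.
w1 = paddle.randn([2, 3])        # scaling factor W1 (stored in full, the "lokr_w1" case)
w2_a = paddle.randn([4, 4])      # adapter factor W2 = w2_a @ w2_b (the "lokr_w2_a/_b" case)
w2_b = paddle.randn([4, 4])
adapter = paddle.kron(w1, w2_a @ w2_b).T   # kron -> [8, 12]; transpose -> [12, 8]

base_weight = paddle.zeros([12, 8])        # paddle.nn.Linear stores weight as [in_features, out_features]
scaling = 8.0 / 4                          # lokr_alpha / lokr_dim, as in weight_process
merged = base_weight + scaling * adapter
print(merged.shape)                        # [12, 8]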
7 changes: 7 additions & 0 deletions llm/utils/argument.py
@@ -223,6 +223,13 @@ class ModelArgument:
    vera: bool = field(default=False, metadata={"help": "Whether to use vera technique"})
    vera_rank: int = field(default=8, metadata={"help": "Vera attention dimension"})

    # lokr related parameters
    lokr: bool = field(default=False, metadata={"help": "Whether to use LoKr technique"})
    lokr_path: str = field(
        default=None, metadata={"help": "Initialize lokr state dict and apply customized lokr config"}
    )
    lokr_dim: int = field(default=8, metadata={"help": "LoKr rank (dimension) of the adapter matrix"})

    # prefix tuning related parameters
    prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"})
    prefix_path: str = field(default=None, metadata={"help": "Initialize prefix state dict."})
2 changes: 1 addition & 1 deletion paddlenlp/peft/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.


from .lokr import LoKrConfig, LoKrModel
from .lora import LoRAConfig, LoRAModel
from .prefix import PrefixConfig, PrefixModelForCausalLM
from .reft import ReFTModel
19 changes: 19 additions & 0 deletions paddlenlp/peft/lokr/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .lokr_config import LoKrConfig
from .lokr_layers import LoKrLinear
from .lokr_model import LoKrModel

__all__ = ["LoKrConfig", "LoKrModel", "LoKrLinear"]
141 changes: 141 additions & 0 deletions paddlenlp/peft/lokr/lokr_config.py
@@ -0,0 +1,141 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Union

from ...utils.env import LOKR_CONFIG_NAME


@dataclass
class LoKrConfig:
    """
    This is the configuration class to store the configuration of a [`LoKrModel`].
    LoKrModel convention: W1 is referred to as the scaling matrix and W2 as the adapter matrix.
    Args:
        target_modules (`Union[List[str], str]`): The names of the modules to apply LoKr to.
        trainable_modules (`List[str]`): The names of the modules to train when applying LoKr.
        lokr_alpha (`float`): The alpha parameter for LoKr scaling.
        merge_weight (`bool`):
            Whether to merge the weights of the LoKr layers with the base transformer model in `eval` mode.
    """

    base_model_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "The name of the base model to use."}
    )
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with LoKr."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    trainable_modules: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to train when applying LoKr."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    trainable_bias: Optional[str] = field(
        default=None, metadata={"help": "Define trainable bias parameters for the LoKr model."}
    )
    lokr_dim: int = field(default=8, metadata={"help": "LoKr rank (dimension) of the adapter matrix"})
    factor: int = field(default=-1, metadata={"help": "Determines the decomposition size of the LoKr matrices"})
    decompose_both: bool = field(
        default=False,
        metadata={"help": "Whether to decompose both the scaling matrix and the adapter matrix"},
    )
    lokr_alpha: float = field(
        default=0.0, metadata={"help": "Determines the scaling of the adapter weight, following the LoKr convention"}
    )
    merge_weight: bool = field(
        default=False, metadata={"help": "Merge the weights of the original model and the LoKr model"}
    )
    tensor_parallel_degree: int = field(default=-1, metadata={"help": "-1 means tensor parallelism is not used"})
    dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"})

    @property
    def __dict__(self):
        return asdict(self)

    def to_dict(self):
        return self.__dict__

    @property
    def scaling(self):
        if not (self.lokr_alpha or self.lokr_dim):
            return 1.0
        return self.lokr_alpha / self.lokr_dim

    def save_pretrained(self, save_directory):
        r"""
        This method saves the configuration of your adapter model in a directory.
        Args:
            save_directory (`str`):
                The directory where the configuration will be saved.
        """
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        os.makedirs(save_directory, exist_ok=True)

        output_dict = self.__dict__
        output_dict["scaling"] = self.scaling
        output_path = os.path.join(save_directory, LOKR_CONFIG_NAME)

        # save it
        with open(output_path, "w") as writer:
            writer.write(json.dumps(output_dict, indent=2, sort_keys=True))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        This method loads the configuration of your adapter model from a directory.
        Args:
            pretrained_model_name_or_path (`str`):
                The directory or the hub-id where the configuration is saved.
            **kwargs:
                Additional keyword arguments passed along to the child class initialization.
        """
        if os.path.isfile(os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME)):
            config_file = os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME)
        else:
            raise ValueError(f"Can't find lokr_config.json at '{pretrained_model_name_or_path}'")

        loaded_attributes = cls.from_json_file(config_file)
        loaded_attributes.pop("scaling", None)

        config = cls(**kwargs)

        for key, value in loaded_attributes.items():
            if hasattr(config, key):
                setattr(config, key, value)

        return config

    @classmethod
    def from_json_file(cls, path_json_file):
        r"""
        Loads a configuration file from a json file.
        Args:
            path_json_file (`str`):
                The path to the json file.
        """
        with open(path_json_file, "r") as file:
            json_object = json.load(file)

        return json_object
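
A short round-trip sketch for LoKrConfig; the directory path and field values below are illustrative assumptions, not taken from this commit.

from paddlenlp.peft import LoKrConfig

cfg = LoKrConfig(target_modules=[".*q_proj.*"], lokr_dim=8, lokr_alpha=16.0, dtype="bfloat16")
print(cfg.scaling)                     # 16.0 / 8 = 2.0

cfg.save_pretrained("./lokr_cfg")      # writes lokr_config.json (LOKR_CONFIG_NAME) into the directory
restored = LoKrConfig.from_pretrained("./lokr_cfg")
assert restored.lokr_dim == 8 and restored.lokr_alpha == 16.0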