Add LongLora for both full and lora fine-tuning #1350

Status: Open. Wants to merge 13 commits into base: main.
135 changes: 135 additions & 0 deletions config_hub/finetune/llama-2-7b/longlora.yaml
@@ -0,0 +1,135 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-llama2-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.0

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 512

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# LongLoRA-related arguments. See ``litgpt.args.LongLoRAArgs`` for details
longlora:
# Whether to use LongLoRA. (type: bool, default: false)
use_longlora: true

# The enlarged context length for LongLoRA. (type: int, default: 8192)
context_length: 8192

# The number of groups to split the sequence into. (type: int, default: 4)
n_groups: 4

# The additional trainable parameters for LongLoRA. (type: str, default: "wte,norm,ln")
trainable_params: "wte,norm,ln"

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
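
Note: a config file like this one is normally consumed by the LoRA finetuning entry point, e.g. a command along the lines of `litgpt finetune lora --config config_hub/finetune/llama-2-7b/longlora.yaml`; the exact CLI form depends on the LitGPT version and is not part of this diff.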
135 changes: 135 additions & 0 deletions config_hub/finetune/mistral-7b/longlora.yaml
@@ -0,0 +1,135 @@

# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-mistral-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.0

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
class_path: litgpt.data.Alpaca2k
init_args:
mask_prompt: false
prompt_style: alpaca
ignore_index: -100
seed: 42
num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:

# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 200

# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1

# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
global_batch_size: 8

# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 2

# Number of iterations with learning rate warmup active (type: int, default: 100)
lr_warmup_steps: 10

# Number of epochs to train on (type: Optional[int], default: 5)
epochs: 4

# Total number of tokens to train on (type: Optional[int], default: null)
max_tokens:

# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:

# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 512

# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

# (type: float, default: 6e-05)
min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:

# Number of optimizer steps between evaluation calls (type: int, default: 100)
interval: 100

# Number of tokens to generate (type: Optional[int], default: 100)
max_new_tokens: 100

# Number of iterations (type: int, default: 100)
max_iters: 100

# LongLoRA-related arguments. See ``litgpt.args.LongLoRAArgs`` for details
longlora:
# Whether to use LongLoRA. (type: bool, default: false)
use_longlora: true

# The enlarged context length for LongLoRA. (type: int, default: 8192)
context_length: 8192

# The number of groups to split the sequence into. (type: int, default: 4)
n_groups: 4

# The additional trainable parameters for LongLoRA. (type: str, default: "wte,norm,ln")
trainable_params: "wte,norm,ln"

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337
22 changes: 14 additions & 8 deletions litgpt/args.py
@@ -40,14 +40,6 @@ class TrainArgs:
max_norm: Optional[float] = None
min_lr: float = 6e-5

def __post_init__(self) -> None:
if self.lr_warmup_fraction and self.lr_warmup_steps:
raise ValueError(
"Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one."
)
if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1):
raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.")

def gradient_accumulation_iters(self, devices: int) -> int:
"""Number of iterations between gradient synchronizations"""
gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size
@@ -79,3 +71,17 @@ class EvalArgs:
"""Number of tokens to generate"""
max_iters: int = 100
"""Number of iterations"""


@dataclass
class LongLoraArgs:
"""GaLore-related arguments"""

use_longlora: bool = False
"""Whether to enable LongLora."""
n_groups: int = 4
"""Number of groups to divide the sequence length into."""
context_length: int = 8192
Collaborator:
I wonder here what happens if the model has a longer context already. A good test case could be LongChat (supported in LitGPT).

I wonder if this should be a factor (2x the original context length) or None by default and then infer 2x the original context length.

Contributor Author (belerico):

I've added two checks: one in `validate_longlora_args`, where, if LongLora is used and `longlora_context_length <= model.block_size`, a warning is raised and LongLora is disabled; the other right before the model creation, where I increase the model's block size and RoPE condense ratio only if LongLora is enabled and `longlora_context_length > model.block_size`. I can remove the second check and fall back to None as the default, inferring 2x the original context length in that case.

"""Length of the enlarged context window."""
trainable_params: str = "wte,norm,ln"
"""List of comma-separated parameters to train in LongLora."""
Collaborator:
What are the other options? Are "wte,norm,ln" the only allowed ones or are there more?

Contributor Author (belerico), Apr 25, 2024, quoting the Collaborator's follow-up:

"Oh sorry, I wasn't clear. I meant more like: what are the supported options here? What values can a user typically put in? ... You probably can't use Literal here because of the various combinations within that string, but maybe the comments could mention which of the terms within that comma-separated string are supported."

Sorry, I didn't get it at first. I was looking at the model, and if I'm not missing something, those are the only parameters left other than the LoRA layers (which are controlled by the arguments in the finetune/lora.py script). I can add a check to prevent the user from passing anything other than a combination of those three layers.
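
Such a check could take roughly the following shape: a whitelist of the three supported groups plus a helper that marks the matching parameters trainable. All names here are illustrative assumptions, not the code added by this PR.

```python
ALLOWED_LONGLORA_PARAMS = {"wte", "norm", "ln"}


def validate_trainable_params(trainable_params: str) -> list:
    """Split the comma-separated string and reject anything outside wte/norm/ln."""
    names = [p.strip() for p in trainable_params.split(",") if p.strip()]
    unknown = set(names) - ALLOWED_LONGLORA_PARAMS
    if unknown:
        raise ValueError(
            f"Unsupported LongLoRA trainable params {sorted(unknown)}; "
            f"supported values are {sorted(ALLOWED_LONGLORA_PARAMS)}."
        )
    return names


def mark_longlora_trainable(model, trainable_params: str) -> None:
    """Make embedding and normalization weights trainable in addition to the LoRA layers."""
    names = validate_trainable_params(trainable_params)
    for param_name, param in model.named_parameters():
        if any(name in param_name for name in names):
            param.requires_grad = True
```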

4 changes: 3 additions & 1 deletion litgpt/config.py
@@ -61,6 +61,8 @@ class Config:
rope_base: int = 10000
n_expert: int = 0
n_expert_per_token: int = 0
use_longlora: bool = False
longlora_n_groups: int = 4

def __post_init__(self):
if not self.name:
@@ -836,7 +838,7 @@ def norm_class(self) -> Type:
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)


###############
# Meta LLaMA 3
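
The two new `Config` fields gate LongLoRA's shifted sparse attention (S²-Attn): the sequence is split into `longlora_n_groups` groups, and half of the attention heads attend within groups shifted by half a group size so information can flow across group boundaries. The sketch below only illustrates that idea, assuming an even head count and a sequence length divisible by the group count; it ignores the extra masking a faithful implementation applies where the shifted groups wrap around, and it is not the attention code used by this PR.

```python
import torch
import torch.nn.functional as F


def s2_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n_groups: int) -> torch.Tensor:
    """Illustrative shifted sparse attention.

    q, k, v have shape (B, n_heads, T, head_dim), with T divisible by n_groups
    and n_heads even.
    """
    B, H, T, D = q.shape
    group = T // n_groups
    shift = group // 2

    def shift_half_heads(x: torch.Tensor) -> torch.Tensor:
        # The second half of the heads attends over groups offset by half a group size.
        x1, x2 = x.chunk(2, dim=1)
        return torch.cat([x1, torch.roll(x2, shifts=-shift, dims=2)], dim=1)

    def to_groups(x: torch.Tensor) -> torch.Tensor:
        # (B, H, T, D) -> (B * n_groups, H, group, D): attention stays within each group.
        return x.view(B, H, n_groups, group, D).permute(0, 2, 1, 3, 4).reshape(B * n_groups, H, group, D)

    q, k, v = (to_groups(shift_half_heads(t)) for t in (q, k, v))
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

    # Undo the grouping, then roll the shifted heads back into place.
    out = out.view(B, n_groups, H, group, D).permute(0, 2, 1, 3, 4).reshape(B, H, T, D)
    out1, out2 = out.chunk(2, dim=1)
    return torch.cat([out1, torch.roll(out2, shifts=shift, dims=2)], dim=1)
```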
16 changes: 13 additions & 3 deletions litgpt/data/alpaca.py
@@ -43,6 +43,7 @@ class Alpaca(DataModule):
tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False)
batch_size: int = field(default=1, init=False, repr=False)
max_seq_length: int = field(default=-1, init=False, repr=False)
pad_multiple_of: Optional[int] = field(default=None, init=False, repr=False)
train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)
test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False)

@@ -51,11 +52,16 @@ def __post_init__(self) -> None:
self.prompt_style = PromptStyle.from_name(self.prompt_style)

def connect(
self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None
self,
tokenizer: Optional[Tokenizer] = None,
batch_size: int = 1,
max_seq_length: Optional[int] = None,
pad_multiple_of: Optional[int] = None,
) -> None:
self.tokenizer = tokenizer
self.batch_size = batch_size
self.max_seq_length = -1 if max_seq_length is None else max_seq_length
self.pad_multiple_of = pad_multiple_of

def prepare_data(self) -> None:
self.download_dir.mkdir(parents=True, exist_ok=True)
@@ -97,7 +103,9 @@ def train_dataloader(self) -> DataLoader:
shuffle=True,
generator=torch.Generator().manual_seed(self.seed),
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)

def val_dataloader(self) -> DataLoader:
@@ -106,7 +114,9 @@ def val_dataloader(self) -> DataLoader:
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index),
collate_fn=get_sft_collate_fn(
max_seq_length=self.max_seq_length, ignore_index=self.ignore_index, pad_multiple_of=self.pad_multiple_of
),
)
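
The new `pad_multiple_of` argument is threaded through to `get_sft_collate_fn`, presumably so batches can be padded to a length divisible by LongLoRA's `n_groups` before the sequence is split into groups. A minimal sketch of that kind of right-padding (not litgpt's actual collate function) might look like this:

```python
from typing import Optional

import torch
import torch.nn.functional as F


def pad_right_to_multiple(ids: torch.Tensor, pad_id: int, pad_multiple_of: Optional[int]) -> torch.Tensor:
    """Pad a 1-D token tensor on the right so its length becomes a multiple of pad_multiple_of."""
    if pad_multiple_of is None:
        return ids
    remainder = ids.size(0) % pad_multiple_of
    if remainder == 0:
        return ids
    return F.pad(ids, (0, pad_multiple_of - remainder), value=pad_id)
```

Sequences padded this way can then be split evenly into the groups that shifted attention operates on.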

