diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index 05a1dbeab6..c31a862380 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -11,9 +11,12 @@ For more information, see the [Dealing with out-of-memory (OOM) errors](../../tu | falcon-7b/lora.yaml | 7B | Alpaca 2k | 4 | 0.945 | 16.69 GB | 512 | 2 | bfloat16 | 24.88 min (1xA10G) | | falcon-7b/qlora.yaml | 7B | Alpaca 2k | 4 | 0.993 | 9.44 GB | 512 | 2 | bfloat16 | 50.76 min (1xA10G) | | | | | | | | | | | | -| gemma-2b/lora.yaml | 2B | Alpaca 2k | 3 | 1.476 | 12.62 GB | 512 | 2 | bfloat16 | 18.31 min (1xA10G) | -| gemma-2b/qlora.yaml | 2B | Alpaca 2k | 3 | 1.626 | 11.51 GB | 512 | 2 | bfloat16 | 25.29 min (1xA10G) | -| gemma-2b/full.yaml | 2B | Alpaca 2k | 0.35 | 1.046 | 18.47 GB | 512 | 2 | bfloat16 | 16.79 min (2xA10G) | +| gemma-2b/lora.yaml | 2B | Alpaca 2k | 2 | 1.476 | 12.62 GB | 512 | 2 | bfloat16 | 9.29 min (1xA10G) | +| gemma-2b/qlora.yaml | 2B | Alpaca 2k | 2 | 0.981 | 11.59 GB | 512 | 2 | bfloat16 | 12.90 min (1xA10G) | +| gemma-2b/full.yaml | 2B | Alpaca 2k | 0.35 | 0.990 | 17.43 GB | 512 | 1 | bfloat16 | 13.61 min (4xA10G) | +| | | | | | | | | | | +| gemma-7b/lora.yaml | 7B | Alpaca 2k | 2 | 0.903 | 25.30 GB | 512 | 1 | bfloat16 | 11.47 min (1xA100) | +| gemma-7b/qlora.yaml | 7B | Alpaca 2k | 2 | 0.951 | 17.31 GB | 512 | 1 | bfloat16 | 23.46 min (1xA100) | | | | | | | | | | | | | llama-2-7b/lora.yaml | 7B | Alpaca 2k | 4 | 0.802 | 19.77 GB | 512 | 2 | bfloat16 | 32.75 min (A10G) | | llama-2-7b/qlora.yaml | 7B | Alpaca 2k | 4 | 0.814 | 13.68 GB | 512 | 2 | bfloat16 | 45.68 min (A10G) | diff --git a/config_hub/finetune/gemma-2b/full.yaml b/config_hub/finetune/gemma-2b/full.yaml index 509a2675e4..77f20658ca 100644 --- a/config_hub/finetune/gemma-2b/full.yaml +++ b/config_hub/finetune/gemma-2b/full.yaml @@ -9,7 +9,7 @@ out_dir: out/finetune/full-gemma-2b precision: bf16-true # How many devices/GPUs to use. 
(type: Union[int, str], default: 1) -devices: 1 +devices: 4 # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. data: @@ -32,7 +32,7 @@ train: log_interval: 1 # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) - global_batch_size: 6 + global_batch_size: 16 # Number of samples per data-parallel rank (type: int, default: 4) micro_batch_size: 1 @@ -41,13 +41,13 @@ train: lr_warmup_steps: 100 # Number of epochs to train on (type: Optional[int], default: 5) - epochs: 3 + epochs: 1 # Total number of tokens to train on (type: Optional[int], default: null) max_tokens: # Limits the number of optimizer steps to run. (type: Optional[int], default: null) - max_steps: + max_steps: 50 # Limits the length of samples. Off by default (type: Optional[int], default: null) max_seq_length: 512 diff --git a/config_hub/finetune/gemma-2b/lora.yaml b/config_hub/finetune/gemma-2b/lora.yaml index 72d56fc22b..c9f912a47c 100644 --- a/config_hub/finetune/gemma-2b/lora.yaml +++ b/config_hub/finetune/gemma-2b/lora.yaml @@ -15,7 +15,7 @@ quantize: devices: 1 # The LoRA rank. (type: int, default: 8) -lora_r: 16 +lora_r: 8 # The LoRA alpha. 
(type: int, default: 16) lora_alpha: 16 @@ -71,7 +71,7 @@ train: lr_warmup_steps: 200 # Number of epochs to train on (type: Optional[int], default: 5) - epochs: 4 + epochs: 2 # Total number of tokens to train on (type: Optional[int], default: null) max_tokens: diff --git a/config_hub/finetune/gemma-2b/qlora.yaml b/config_hub/finetune/gemma-2b/qlora.yaml index 4c26c9cee8..dc15fe90d3 100644 --- a/config_hub/finetune/gemma-2b/qlora.yaml +++ b/config_hub/finetune/gemma-2b/qlora.yaml @@ -71,7 +71,7 @@ train: lr_warmup_steps: 200 # Number of epochs to train on (type: Optional[int], default: 5) - epochs: 4 + epochs: 2 # Total number of tokens to train on (type: Optional[int], default: null) max_tokens: diff --git a/config_hub/finetune/gemma-7b/lora.yaml b/config_hub/finetune/gemma-7b/lora.yaml new file mode 100644 index 0000000000..d7d56f5b5c --- /dev/null +++ b/config_hub/finetune/gemma-7b/lora.yaml @@ -0,0 +1,122 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/google/gemma-7b + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/lora-gemma-7b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 16 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.1 + +# Whether to apply LoRA to the query weights in attention. 
(type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: true + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: true + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: true + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: true + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.03847 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 800 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 6 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 200 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) + tie_embeddings: + + # (type: float, default: 0.0003) + learning_rate: 0.0002 + + # (type: float, default: 0.02) + weight_decay: 0.0 + + # (type: float, default: 0.9) + beta1: 0.9 + + # (type: float, default: 0.95) + beta2: 0.95 + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 diff --git a/config_hub/finetune/gemma-7b/qlora.yaml b/config_hub/finetune/gemma-7b/qlora.yaml new file mode 100644 index 0000000000..7d4a2c634c --- /dev/null +++ b/config_hub/finetune/gemma-7b/qlora.yaml @@ -0,0 +1,122 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/google/gemma-7b + +# Directory in which to save checkpoints and logs. (type: , default: out/lora) +out_dir: out/finetune/qlora-gemma-7b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# The LoRA rank. 
(type: int, default: 8) +lora_r: 16 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.1 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: true + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: true + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: true + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: true + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.03847 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 800 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 6 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 1 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 200 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 2 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. 
(type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: float, default: 0.0003) + learning_rate: 0.0002 + + # (type: float, default: 0.02) + weight_decay: 0.0 + + # (type: float, default: 0.9) + beta1: 0.9 + + # (type: float, default: 0.95) + beta2: 0.95 + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337