From f203eb3548751b73f9be5bf0798f4bf31adf9ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 19 Apr 2024 09:52:21 -0400 Subject: [PATCH] ft --- config_hub/finetune/README.md | 4 + config_hub/finetune/llama-3-8b/full.yaml | 95 +++++++++++++++++ config_hub/finetune/llama-3-8b/lora.yaml | 121 +++++++++++++++++++++ config_hub/finetune/llama-3-8b/qlora.yaml | 123 ++++++++++++++++++++++ 4 files changed, 343 insertions(+) create mode 100644 config_hub/finetune/llama-3-8b/full.yaml create mode 100644 config_hub/finetune/llama-3-8b/lora.yaml create mode 100644 config_hub/finetune/llama-3-8b/qlora.yaml diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index fc82e0854b..bc2a3a4f74 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -22,6 +22,10 @@ For more information, see the [Dealing with out-of-memory (OOM) errors](../../tu | llama-2-7b/qlora.yaml | 7B | Alpaca 2k | 4 | 0.814 | 13.68 GB | 512 | 2 | bfloat16 | 45.68 min (A10G) | | llama-2-7b/full.yaml | 7B | Alpaca 2k | 1 | 0.941 | 26.81 GB | 512 | 4 | bfloat16 | 1.78 min (4xA100) | | | | | | | | | | | | +| llama-3-8b/lora.yaml | 8B | Alpaca 2k | 4 | ----- | -------- | 512 | 2 | bfloat16 | ---------------- | +| llama-3-8b/qlora.yaml | 8B | Alpaca 2k | 4 | ----- | -------- | 512 | 2 | bfloat16 | ---------------- | +| llama-3-8b/full.yaml | 8B | Alpaca 2k | 1 | ----- | -------- | 512 | 4 | bfloat16 | ---------------- | +| | | | | | | | | | | | mistral-7b/lora.yaml (v0.1) | 7B | Alpaca 2k | 4 | 0.796 | 20.65 GB | 512 | 2 | bfloat16 | 31.04 min (1xA10G) | | mistral-7b/qlora.yaml (v0.1) | 7B | Alpaca 2k | 4 | 0.803 | 14.29 GB | 512 | 2 | bfloat16 | 44.69 min (1xA10G) | | | | | | | | | | | | diff --git a/config_hub/finetune/llama-3-8b/full.yaml b/config_hub/finetune/llama-3-8b/full.yaml new file mode 100644 index 0000000000..11aebcb155 --- /dev/null +++ b/config_hub/finetune/llama-3-8b/full.yaml @@ -0,0 +1,95 @@ + +# The path to the base model's 
checkpoint directory to load for finetuning. (type: Path, default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B + +# Directory in which to save checkpoints and logs. (type: Path, default: out/finetune/full) +out_dir: out/finetune/full-llama-3-8b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# How many devices/GPUs to use (type: Union[int, str], default: 1) +devices: 4 + +# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume +# from the latest checkpoint in ``out_dir``. (type: Union[bool, Path], default: False) +resume: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) + global_batch_size: 64 + + # Number of samples per data-parallel rank (type: int, default: 1) + micro_batch_size: 4 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 25 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 1 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. 
Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: float, default: 0.003) + learning_rate: 0.0002 + + # (type: float, default: 0.02) + weight_decay: 0.1 + + # (type: float, default: 0.9) + beta1: 0.9 + + # (type: float, default: 0.95) + beta2: 0.95 + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 600) + interval: 25 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337 diff --git a/config_hub/finetune/llama-3-8b/lora.yaml b/config_hub/finetune/llama-3-8b/lora.yaml new file mode 100644 index 0000000000..ac3fc8d8f6 --- /dev/null +++ b/config_hub/finetune/llama-3-8b/lora.yaml @@ -0,0 +1,121 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: Path, default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B + +# Directory in which to save checkpoints and logs. (type: Path, default: out/lora) +out_dir: out/finetune/lora-llama-3-8b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. 
(type: Optional[Literal['bnb.nf4', 'bnb.nf4-dq', 'bnb.fp4', 'bnb.fp4-dq', 'bnb.int8-training']], default: null) +quantize: + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. +data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + +# Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 4 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: float, default: 0.0003) + learning_rate: 0.0002 + + # (type: float, default: 0.02) + weight_decay: 0.0 + + # (type: float, default: 0.9) + beta1: 0.9 + + # (type: float, default: 0.95) + beta2: 0.95 + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. 
(type: int, default: 1337) +seed: 1337 diff --git a/config_hub/finetune/llama-3-8b/qlora.yaml b/config_hub/finetune/llama-3-8b/qlora.yaml new file mode 100644 index 0000000000..5b0f08059d --- /dev/null +++ b/config_hub/finetune/llama-3-8b/qlora.yaml @@ -0,0 +1,123 @@ + +# The path to the base model's checkpoint directory to load for finetuning. (type: Path, default: checkpoints/stabilityai/stablelm-base-alpha-3b) +checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B + +# Directory in which to save checkpoints and logs. (type: Path, default: out/lora) +out_dir: out/finetune/qlora-llama-3-8b + +# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-true + +# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['bnb.nf4', 'bnb.nf4-dq', 'bnb.fp4', 'bnb.fp4-dq', 'bnb.int8-training']], default: null) +quantize: bnb.nf4 + +# How many devices/GPUs to use. (type: Union[int, str], default: 1) +devices: 1 + +# The LoRA rank. (type: int, default: 8) +lora_r: 32 + +# The LoRA alpha. (type: int, default: 16) +lora_alpha: 16 + +# The LoRA dropout value. (type: float, default: 0.05) +lora_dropout: 0.05 + +# Whether to apply LoRA to the query weights in attention. (type: bool, default: True) +lora_query: true + +# Whether to apply LoRA to the key weights in attention. (type: bool, default: False) +lora_key: false + +# Whether to apply LoRA to the value weights in attention. (type: bool, default: True) +lora_value: true + +# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) +lora_projection: false + +# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) +lora_mlp: false + +# Whether to apply LoRA to output head in GPT. (type: bool, default: False) +lora_head: false + +# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 
+data: + class_path: litgpt.data.Alpaca2k + init_args: + mask_prompt: false + val_split_fraction: 0.05 + prompt_style: alpaca + ignore_index: -100 + seed: 42 + num_workers: 4 + download_dir: data/alpaca2k + +# Training-related arguments. See ``litgpt.args.TrainArgs`` for details +train: + + # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) + save_interval: 200 + + # Number of iterations between logging calls (type: int, default: 1) + log_interval: 1 + + # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) + global_batch_size: 8 + + # Number of samples per data-parallel rank (type: int, default: 4) + micro_batch_size: 2 + + # Number of iterations with learning rate warmup active (type: int, default: 100) + lr_warmup_steps: 10 + + # Number of epochs to train on (type: Optional[int], default: 5) + epochs: 4 + + # Total number of tokens to train on (type: Optional[int], default: null) + max_tokens: + + # Limits the number of optimizer steps to run. (type: Optional[int], default: null) + max_steps: + + # Limits the length of samples. Off by default (type: Optional[int], default: null) + max_seq_length: 512 + + # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) + tie_embeddings: + + # (type: float, default: 0.0003) + learning_rate: 0.0002 + + # (type: float, default: 0.02) + weight_decay: 0.0 + + # (type: float, default: 0.9) + beta1: 0.9 + + # (type: float, default: 0.95) + beta2: 0.95 + + # (type: Optional[float], default: null) + max_norm: + + # (type: float, default: 6e-05) + min_lr: 6.0e-05 + +# Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details +eval: + + # Number of optimizer steps between evaluation calls (type: int, default: 100) + interval: 100 + + # Number of tokens to generate (type: Optional[int], default: 100) + max_new_tokens: 100 + + # Number of iterations (type: int, default: 100) + max_iters: 100 + +# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) +logger_name: csv + +# The random seed to use for reproducibility. (type: int, default: 1337) +seed: 1337