diff --git a/config_hub/finetune/falcon-7b/lora.yaml b/config_hub/finetune/falcon-7b/lora.yaml index c45b0fed94..83718b95b2 100644 --- a/config_hub/finetune/falcon-7b/lora.yaml +++ b/config_hub/finetune/falcon-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/falcon-7b/qlora.yaml b/config_hub/finetune/falcon-7b/qlora.yaml index 33ab9d9fc3..d2b0a4000d 100644 --- a/config_hub/finetune/falcon-7b/qlora.yaml +++ b/config_hub/finetune/falcon-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/full.yaml b/config_hub/finetune/gemma-2b/full.yaml index 879f1afee9..27e5c79576 100644 --- a/config_hub/finetune/gemma-2b/full.yaml +++ b/config_hub/finetune/gemma-2b/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/lora.yaml b/config_hub/finetune/gemma-2b/lora.yaml index 91af82800d..41239938a0 100644 --- a/config_hub/finetune/gemma-2b/lora.yaml +++ b/config_hub/finetune/gemma-2b/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.2 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/qlora.yaml b/config_hub/finetune/gemma-2b/qlora.yaml index 159ae2cc86..f927931eac 100644 --- a/config_hub/finetune/gemma-2b/qlora.yaml +++ b/config_hub/finetune/gemma-2b/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-7b/lora.yaml b/config_hub/finetune/gemma-7b/lora.yaml index 59120c5d0b..171a95fd4a 100644 --- a/config_hub/finetune/gemma-7b/lora.yaml +++ b/config_hub/finetune/gemma-7b/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-7b/qlora.yaml b/config_hub/finetune/gemma-7b/qlora.yaml index 556fba0cf5..dfc04df63f 100644 --- a/config_hub/finetune/gemma-7b/qlora.yaml +++ b/config_hub/finetune/gemma-7b/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml index 99de788c74..7705daf734 100644 --- a/config_hub/finetune/llama-2-7b/full.yaml +++ b/config_hub/finetune/llama-2-7b/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml index 594b2f924d..f736aefa6c 100644 --- a/config_hub/finetune/llama-2-7b/lora.yaml +++ b/config_hub/finetune/llama-2-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/qlora.yaml b/config_hub/finetune/llama-2-7b/qlora.yaml index 106b9422f4..1ce5273db7 100644 --- a/config_hub/finetune/llama-2-7b/qlora.yaml +++ b/config_hub/finetune/llama-2-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/full.yaml b/config_hub/finetune/llama-3-8b/full.yaml index e06d037710..d106d6936e 100644 --- a/config_hub/finetune/llama-3-8b/full.yaml +++ b/config_hub/finetune/llama-3-8b/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/lora.yaml b/config_hub/finetune/llama-3-8b/lora.yaml index 1d874a0690..5b20d70169 100644 --- a/config_hub/finetune/llama-3-8b/lora.yaml +++ b/config_hub/finetune/llama-3-8b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/qlora.yaml b/config_hub/finetune/llama-3-8b/qlora.yaml index 33a0fc98be..31cc2ec93e 100644 --- a/config_hub/finetune/llama-3-8b/qlora.yaml +++ b/config_hub/finetune/llama-3-8b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b-v0.2/lora.yaml b/config_hub/finetune/mistral-7b-v0.2/lora.yaml index f56e34c525..eb4228a57e 100644 --- a/config_hub/finetune/mistral-7b-v0.2/lora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml index b648b24d72..e36e5d6925 100644 --- a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b/lora.yaml b/config_hub/finetune/mistral-7b/lora.yaml index e991ec424e..10e13d935f 100644 --- a/config_hub/finetune/mistral-7b/lora.yaml +++ b/config_hub/finetune/mistral-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b/qlora.yaml b/config_hub/finetune/mistral-7b/qlora.yaml index e43b745bb8..a985c6770e 100644 --- a/config_hub/finetune/mistral-7b/qlora.yaml +++ b/config_hub/finetune/mistral-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml index 5b302a48ac..9509152c08 100644 --- a/config_hub/finetune/phi-2/full.yaml +++ b/config_hub/finetune/phi-2/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/lora.yaml b/config_hub/finetune/phi-2/lora.yaml index 2571bc02d0..81da9b0826 100644 --- a/config_hub/finetune/phi-2/lora.yaml +++ b/config_hub/finetune/phi-2/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/qlora.yaml b/config_hub/finetune/phi-2/qlora.yaml index d48d910939..0c6e91df88 100644 --- a/config_hub/finetune/phi-2/qlora.yaml +++ b/config_hub/finetune/phi-2/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml index c196fcc017..ecf3d1a25d 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml index 6e52ea2175..e85dbfd4a4 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml index ebd2f098eb..2980a7013e 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/full.yaml b/config_hub/finetune/tiny-llama/full.yaml index fe1d1ef99d..e85f928337 100644 --- a/config_hub/finetune/tiny-llama/full.yaml +++ b/config_hub/finetune/tiny-llama/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml index c42ff28ff3..f140a8d26d 100644 --- a/config_hub/finetune/tiny-llama/lora.yaml +++ b/config_hub/finetune/tiny-llama/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/qlora.yaml b/config_hub/finetune/tiny-llama/qlora.yaml index 7e80e4d0ca..bcf8112a01 100644 --- a/config_hub/finetune/tiny-llama/qlora.yaml +++ b/config_hub/finetune/tiny-llama/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/pretrain/debug.yaml b/config_hub/pretrain/debug.yaml index e89dda3cc9..ab848aa341 100644 --- a/config_hub/pretrain/debug.yaml +++ b/config_hub/pretrain/debug.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: False) tie_embeddings: - # (type: float, default: 0.0004) - learning_rate: 6e-4 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -91,6 +79,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 6e-4 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinyllama.yaml b/config_hub/pretrain/tinyllama.yaml index e2418a5b17..5dc8bf64b3 100644 --- a/config_hub/pretrain/tinyllama.yaml +++ b/config_hub/pretrain/tinyllama.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) tie_embeddings: - # (type: float, default: 0.0004) - learning_rate: 4e-4 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -91,6 +79,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 4e-4 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. 
(type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinystories.yaml b/config_hub/pretrain/tinystories.yaml index 8ed53a09d7..ba2b03d6e2 100644 --- a/config_hub/pretrain/tinystories.yaml +++ b/config_hub/pretrain/tinystories.yaml @@ -74,18 +74,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) tie_embeddings: true - # (type: float, default: 0.0004) - learning_rate: 0.0005 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -107,6 +95,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0005 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. 
(type: Union[int, str], default: auto) devices: auto diff --git a/extensions/thunder/pretrain.py b/extensions/thunder/pretrain.py index 6aa77a745f..757a3ecb99 100644 --- a/extensions/thunder/pretrain.py +++ b/extensions/thunder/pretrain.py @@ -8,7 +8,7 @@ from datetime import timedelta from functools import partial from pathlib import Path -from typing import Any, Callable, Optional, Tuple, Union, List +from typing import Any, Callable, Optional, Tuple, Union, List, Dict import lightning as L import torch @@ -30,6 +30,7 @@ choose_logger, chunked_cross_entropy, copy_config_files, + instantiate_torch_optimizer, num_parameters, parse_devices, reset_parameters, @@ -55,16 +56,13 @@ def setup( global_batch_size=512, micro_batch_size=4, max_tokens=int(3e12), # 3 trillion - learning_rate=4e-4, - weight_decay=1e-1, - beta1=0.9, - beta2=0.95, max_norm=1.0, min_lr=4e-5, lr_warmup_steps=2000, tie_embeddings=False, ), eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", devices: Union[int, str] = "auto", tokenizer_dir: Optional[Path] = None, logger_name: Literal["wandb", "tensorboard", "csv"] = "tensorboard", @@ -89,6 +87,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. devices: How many devices/GPUs to use. Uses all GPUs by default. tokenizer_dir: Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data module require this. 
@@ -157,6 +156,7 @@ def setup( tokenizer, train, eval, + optimizer, compiler, ) @@ -174,6 +174,7 @@ def main( tokenizer: Optional[Tokenizer], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], compiler: Optional[Literal["thunder", "torch"]], ) -> None: validate_args(train, eval, initial_checkpoint_dir, resume) @@ -201,13 +202,7 @@ def main( if compiler == "thunder": # avoid `Tensor.register_hook` which is unsupported model._register_backward_hook = lambda *_: None - optimizer = torch.optim.AdamW( - model.parameters(), - lr=train.learning_rate, - weight_decay=train.weight_decay, - betas=(train.beta1, train.beta2), - fused=True, - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) optimizer = fabric.setup_optimizers(optimizer) train_dataloader, val_dataloader = get_dataloaders(fabric, data, tokenizer, train, model.max_seq_length) @@ -231,7 +226,7 @@ def main( fabric.load(resume, state) train_time = time.perf_counter() - fit(fabric, devices, state, train_dataloader, val_dataloader, out_dir, tokenizer_dir, train, eval) + fit(fabric, devices, state, train_dataloader, val_dataloader, out_dir, tokenizer_dir, train, eval, optimizer) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") # Save final checkpoint @@ -251,6 +246,7 @@ def fit( tokenizer_dir: Optional[Path], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: model = state["model"] optimizer = state["optimizer"] @@ -288,7 +284,7 @@ def fit( break # determine and set the learning rate for this iteration - lr = get_lr(train.learning_rate, state["iter_num"], warmup_iters, max_iters, train.min_lr) + lr = get_lr(optimizer.param_groups[0]["lr"], state["iter_num"], warmup_iters, max_iters, train.min_lr) for param_group in optimizer.param_groups: param_group["lr"] = lr diff --git a/litgpt/__main__.py b/litgpt/__main__.py index 821c1f5801..2324ce0b30 100644 --- a/litgpt/__main__.py +++ b/litgpt/__main__.py @@ -118,7 +118,10 @@ def main() -> 
None: if k == "help": continue subsubcommand_parser = _new_parser() - subsubcommand_parser.add_function_arguments(v["fn"]) + if subcommand in ("finetune", "pretrain"): + subsubcommand_parser.add_subclass_arguments(torch.optim.Optimizer, "optimizer", instantiate=False, fail_untyped=False, skip={"params"}) + subsubcommand_parser.set_defaults({"optimizer": "AdamW"}) + subsubcommand_parser.add_function_arguments(v["fn"], skip={"optimizer"}) subcommands.add_subcommand(k, subsubcommand_parser, help=v["help"]) args = root_parser.parse_args() @@ -140,6 +143,10 @@ def main() -> None: torch.set_float32_matmul_precision("high") + # Dictionary unpacking on the jsonargparse namespace seems to flatten inner namespaces; it is unclear whether that is a bug or intended behavior, + # but we can simply convert the namespace to a dict at this point. + kwargs = kwargs.as_dict() + fn(**kwargs) diff --git a/litgpt/args.py b/litgpt/args.py index 7e277fe9e6..e3bac05ef2 100644 --- a/litgpt/args.py +++ b/litgpt/args.py @@ -33,10 +33,6 @@ class TrainArgs: """Whether to tie the embedding weights with the language modeling head weights""" # Optimization args - learning_rate: float = 1e-3 - weight_decay: float = 0.02 - beta1: float = 0.9 - beta2: float = 0.95 max_norm: Optional[float] = None min_lr: float = 6e-5 diff --git a/litgpt/finetune/adapter.py b/litgpt/finetune/adapter.py index 2ec71784e7..3f1030a229 100644 --- a/litgpt/finetune/adapter.py +++ b/litgpt/finetune/adapter.py @@ -29,6 +29,8 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, load_checkpoint, num_parameters, parse_devices, @@ -50,10 +52,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=1e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -69,6 +71,7 @@ def setup(
data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. """ @@ -109,7 +112,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -122,6 +125,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -146,14 +150,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/finetune/adapter_v2.py b/litgpt/finetune/adapter_v2.py index 86526a58e5..785668939e 100644 --- a/litgpt/finetune/adapter_v2.py +++ b/litgpt/finetune/adapter_v2.py @@ -29,6 +29,8 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, load_checkpoint, num_parameters, parse_devices, @@ -50,10 +52,10 @@ def 
setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=1e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -69,6 +71,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. """ @@ -109,7 +112,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -122,6 +125,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -146,14 +150,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/finetune/full.py 
b/litgpt/finetune/full.py index bba6d0ea61..cf32ae501d 100644 --- a/litgpt/finetune/full.py +++ b/litgpt/finetune/full.py @@ -28,6 +28,7 @@ get_default_supported_precision, load_checkpoint, init_out_dir, + instantiate_torch_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -48,10 +49,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=3e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -68,12 +69,14 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. 
""" - pprint(locals()) + data = Alpaca() if data is None else data + devices = parse_devices(devices) out_dir = init_out_dir(out_dir) @@ -97,7 +100,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) - fabric.launch(main, devices, resume, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, resume, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -111,6 +114,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -131,9 +135,8 @@ def main( fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") model = fabric.setup(model) - optimizer = torch.optim.AdamW( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0} @@ -371,4 +374,3 @@ def validate_args(train: TrainArgs, eval: EvalArgs) -> None: issues.append(f"{__file__} requires either epochs or max_steps to be set. 
This is set in {train}") if issues: raise ValueError("\n".join(issues)) - diff --git a/litgpt/finetune/lora.py b/litgpt/finetune/lora.py index a2c3ef07b2..5f5e12dcf9 100644 --- a/litgpt/finetune/lora.py +++ b/litgpt/finetune/lora.py @@ -31,6 +31,8 @@ get_default_supported_precision, load_checkpoint, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -60,10 +62,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=3e-4, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -88,6 +90,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. 
""" @@ -139,7 +142,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -152,6 +155,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -176,14 +180,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py index 56f4ac9b3c..198a6673c9 100644 --- a/litgpt/pretrain.py +++ b/litgpt/pretrain.py @@ -6,7 +6,7 @@ from datetime import timedelta from functools import partial from pathlib import Path -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Dict import lightning as L import torch @@ -23,7 +23,6 @@ from litgpt.data import DataModule, TinyLlama from litgpt.model import GPT, Block, CausalSelfAttention, Config, LLaMAMLP from litgpt.utils import ( - CLI, CycleIterator, capture_hparams, choose_logger, @@ -31,6 +30,7 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, num_parameters, parse_devices, reset_parameters, @@ -53,16 +53,13 @@ def setup( global_batch_size=512, 
micro_batch_size=4, max_tokens=int(3e12), # 3 trillion - learning_rate=4e-4, - weight_decay=1e-1, - beta1=0.9, - beta2=0.95, max_norm=1.0, min_lr=4e-5, lr_warmup_steps=2000, tie_embeddings=False, ), eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", devices: Union[int, str] = "auto", tokenizer_dir: Optional[Path] = None, logger_name: Literal["wandb", "tensorboard", "csv"] = "tensorboard", @@ -85,6 +82,8 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. + devices: How many devices/GPUs to use. Uses all GPUs by default. tokenizer_dir: Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data module require this. @@ -133,6 +132,7 @@ def setup( tokenizer, train, eval, + optimizer, ) @@ -149,6 +149,7 @@ def main( tokenizer: Optional[Tokenizer], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval, initial_checkpoint_dir, resume) @@ -174,13 +175,8 @@ def main( model = torch.compile(model) model = fabric.setup(model) - optimizer = torch.optim.AdamW( - model.parameters(), - lr=train.learning_rate, - weight_decay=train.weight_decay, - betas=(train.beta1, train.beta2), - fused=fabric.device.type == "cuda", - ) + extra_kwargs = {"fused": fabric.device.type == "cuda"} + optimizer = instantiate_torch_optimizer(optimizer, model.parameters(), **extra_kwargs) optimizer = fabric.setup_optimizers(optimizer) train_dataloader, val_dataloader = get_dataloaders(fabric, data, tokenizer, train, model.max_seq_length) @@ -266,7 +262,7 @@ def fit( break # determine and set the learning rate for this iteration - lr = get_lr(train.learning_rate, state["iter_num"], warmup_iters, max_iters, 
train.min_lr) + lr = get_lr(optimizer.param_groups[0]["lr"], state["iter_num"], warmup_iters, max_iters, train.min_lr) for param_group in optimizer.param_groups: param_group["lr"] = lr @@ -442,4 +438,3 @@ def validate_args(train: TrainArgs, eval: EvalArgs, initial_checkpoint_dir, resu issues.append("Can't provide both `--resume` and `--initial_checkpoint_dir`. Choose one.") if issues: raise ValueError("\n".join(issues)) - diff --git a/litgpt/utils.py b/litgpt/utils.py index 18aea56d64..9225af8911 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -21,6 +21,7 @@ from lightning.fabric.strategies import FSDPStrategy from lightning.fabric.utilities.load import _lazy_load as lazy_load from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.cli import instantiate_class from torch.serialization import normalize_storage_type from typing_extensions import Self @@ -486,3 +487,32 @@ def choose_logger( if logger_name == "wandb": return WandbLogger(project=name, resume=resume, **kwargs) raise ValueError(f"`--logger_name={logger_name}` is not a valid option. 
Choose from 'csv', 'tensorboard', 'wandb'.") + + +def get_argument_names(cls): + sig = inspect.signature(cls.__init__) + return {name for name, param in sig.parameters.items() + if param.kind in [inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY]} + + +def instantiate_bnb_optimizer(optimizer, model_parameters): + if (isinstance(optimizer, str) and "AdamW" not in optimizer) or (isinstance(optimizer, dict) and "AdamW" not in optimizer.get("class_path", "")): + raise ValueError("The chosen quantization format only supports the AdamW optimizer.") + + import bitsandbytes as bnb + if isinstance(optimizer, str): + optimizer = bnb.optim.PagedAdamW(model_parameters) + else: + optim_args = get_argument_names(bnb.optim.PagedAdamW) + allowed_kwargs = {key: optimizer["init_args"][key] for key in optim_args & optimizer["init_args"].keys()} + optimizer = bnb.optim.PagedAdamW(model_parameters, **allowed_kwargs) + return optimizer + + +def instantiate_torch_optimizer(optimizer, model_parameters, **kwargs): + if isinstance(optimizer, str): + optimizer_cls = getattr(torch.optim, optimizer) + optimizer = optimizer_cls(model_parameters, **kwargs) + else: + optimizer = instantiate_class(model_parameters, optimizer, **kwargs) + return optimizer diff --git a/tests/test_config_hub.py b/tests/test_config_hub.py index 4ad634ca9b..8987cc0071 100644 --- a/tests/test_config_hub.py +++ b/tests/test_config_hub.py @@ -16,7 +16,7 @@ ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), ( "litgpt/pretrain.py", - "https://raw.githubusercontent.com/Lightning-AI/litgpt/main/config_hub/pretrain/tinystories.yaml", + "https://raw.githubusercontent.com/Lightning-AI/litgpt/eb6ec386a9ffc8214f6435cb8345789b3b31a267/config_hub/pretrain/tinystories.yaml", ), ] diff --git a/tests/test_thunder_pretrain.py b/tests/test_thunder_pretrain.py index 30f9d71afb..e941ad7949 100644 --- a/tests/test_thunder_pretrain.py +++ b/tests/test_thunder_pretrain.py @@ -37,6 +37,7 @@ def test_pretrain(tmp_path, 
monkeypatch): out_dir=out_dir, train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), eval=EvalArgs(interval=1, max_iters=1), + optimizer="AdamW", ) out_dir_contents = set(os.listdir(out_dir)) diff --git a/tests/test_utils.py b/tests/test_utils.py index 554929c77f..ebb018ef5a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -16,6 +16,7 @@ from lightning.fabric.loggers import CSVLogger, TensorBoardLogger from lightning.fabric.plugins import BitsandbytesPrecision from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.cli import instantiate_class from lightning_utilities.core.imports import RequirementCache from litgpt import GPT @@ -29,8 +30,11 @@ chunked_cross_entropy, copy_config_files, find_multiple, + get_argument_names, incremental_save, init_out_dir, + instantiate_bnb_optimizer, + instantiate_torch_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -306,4 +310,46 @@ def test_init_out_dir(tmp_path): with mock.patch.dict(os.environ, {"LIGHTNING_ARTIFACTS_DIR": "prefix"}): assert init_out_dir(relative_path) == Path("prefix") / relative_path - assert init_out_dir(absolute_path) == absolute_path \ No newline at end of file + assert init_out_dir(absolute_path) == absolute_path + + +@pytest.fixture +def model_parameters(): + return [torch.nn.Parameter(torch.randn(2, 2))] + + +def test_instantiate_bnb_optimizer_with_str(model_parameters): + import bitsandbytes as bnb + with mock.patch("litgpt.utils.get_argument_names", return_value={"lr", "eps", "weight_decay"}): + optimizer = instantiate_bnb_optimizer("AdamW", model_parameters) + assert isinstance(optimizer, bnb.optim.adamw.PagedAdamW) + + +def test_instantiate_bnb_optimizer_with_dict(model_parameters): + import bitsandbytes as bnb + optimizer_dict = {"class_path": "AdamW", "init_args": {"lr": 0.01}} + with mock.patch("litgpt.utils.get_argument_names", return_value={"lr", "eps", "weight_decay"}): + optimizer = 
instantiate_bnb_optimizer(optimizer_dict, model_parameters) + assert isinstance(optimizer, bnb.optim.adamw.PagedAdamW) + assert optimizer.param_groups[0]["lr"] == 0.01 + + +def test_instantiate_bnb_optimizer_with_invalid_str(model_parameters): + with pytest.raises(ValueError, match="only supports the AdamW"): + instantiate_bnb_optimizer("SGD", model_parameters) + + +def test_instantiate_torch_optimizer_with_str(model_parameters): + with mock.patch("litgpt.utils.instantiate_class") as mock_instantiate_class: + mock_instantiate_class.return_value = torch.optim.Adam(model_parameters, lr=0.01) + optimizer = instantiate_torch_optimizer("Adam", model_parameters, lr=0.01) + assert isinstance(optimizer, torch.optim.Adam) + assert optimizer.param_groups[0]["lr"] == 0.01 + + +def test_instantiate_torch_optimizer_with_class(model_parameters): + with mock.patch("litgpt.utils.instantiate_class") as mock_instantiate_class: + mock_instantiate_class.return_value = torch.optim.Adam(model_parameters, lr=0.02) + optimizer = instantiate_torch_optimizer(torch.optim.Adam, model_parameters, lr=0.02) + assert isinstance(optimizer, torch.optim.Adam) + assert optimizer.param_groups[0]["lr"] == 0.02 diff --git a/tutorials/pretrain.md b/tutorials/pretrain.md index ce8f92b0e7..9b83d93caa 100644 --- a/tutorials/pretrain.md +++ b/tutorials/pretrain.md @@ -79,8 +79,8 @@ litgpt pretrain \ --tokenizer_dir checkpoints/EleutherAI/pythia-14m \ --data TextFiles \ --data.train_data_path custom_pretraining_data \ - --train.learning_rate 0.005 \ --train.lr_warmup_steps=200 + --optimizer.lr 0.005 ``` @@ -117,8 +117,8 @@ litgpt pretrain \ --out_dir new_phi-2_checkpoint \ --data TextFiles \ --data.train_data_path custom_pretraining_data \ - --train.learning_rate 0.005 \ --train.lr_warmup_steps=200 + --optimizer.lr 0.005 ```