diff --git a/config_hub/finetune/falcon-7b/lora.yaml b/config_hub/finetune/falcon-7b/lora.yaml index c45b0fed94..83718b95b2 100644 --- a/config_hub/finetune/falcon-7b/lora.yaml +++ b/config_hub/finetune/falcon-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/falcon-7b/qlora.yaml b/config_hub/finetune/falcon-7b/qlora.yaml index 33ab9d9fc3..d2b0a4000d 100644 --- a/config_hub/finetune/falcon-7b/qlora.yaml +++ b/config_hub/finetune/falcon-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/full.yaml b/config_hub/finetune/gemma-2b/full.yaml index 879f1afee9..27e5c79576 100644 --- a/config_hub/finetune/gemma-2b/full.yaml +++ b/config_hub/finetune/gemma-2b/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/lora.yaml b/config_hub/finetune/gemma-2b/lora.yaml index 91af82800d..41239938a0 100644 --- a/config_hub/finetune/gemma-2b/lora.yaml +++ b/config_hub/finetune/gemma-2b/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.2 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-2b/qlora.yaml b/config_hub/finetune/gemma-2b/qlora.yaml index 159ae2cc86..f927931eac 100644 --- a/config_hub/finetune/gemma-2b/qlora.yaml +++ b/config_hub/finetune/gemma-2b/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-7b/lora.yaml b/config_hub/finetune/gemma-7b/lora.yaml index 59120c5d0b..171a95fd4a 100644 --- a/config_hub/finetune/gemma-7b/lora.yaml +++ b/config_hub/finetune/gemma-7b/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/gemma-7b/qlora.yaml b/config_hub/finetune/gemma-7b/qlora.yaml index 556fba0cf5..dfc04df63f 100644 --- a/config_hub/finetune/gemma-7b/qlora.yaml +++ b/config_hub/finetune/gemma-7b/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml index 99de788c74..7705daf734 100644 --- a/config_hub/finetune/llama-2-7b/full.yaml +++ b/config_hub/finetune/llama-2-7b/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml index 594b2f924d..f736aefa6c 100644 --- a/config_hub/finetune/llama-2-7b/lora.yaml +++ b/config_hub/finetune/llama-2-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-2-7b/qlora.yaml b/config_hub/finetune/llama-2-7b/qlora.yaml index 106b9422f4..1ce5273db7 100644 --- a/config_hub/finetune/llama-2-7b/qlora.yaml +++ b/config_hub/finetune/llama-2-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/full.yaml b/config_hub/finetune/llama-3-8b/full.yaml index e06d037710..d106d6936e 100644 --- a/config_hub/finetune/llama-3-8b/full.yaml +++ b/config_hub/finetune/llama-3-8b/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/lora.yaml b/config_hub/finetune/llama-3-8b/lora.yaml index 1d874a0690..5b20d70169 100644 --- a/config_hub/finetune/llama-3-8b/lora.yaml +++ b/config_hub/finetune/llama-3-8b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/llama-3-8b/qlora.yaml b/config_hub/finetune/llama-3-8b/qlora.yaml index 33a0fc98be..31cc2ec93e 100644 --- a/config_hub/finetune/llama-3-8b/qlora.yaml +++ b/config_hub/finetune/llama-3-8b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b-v0.2/lora.yaml b/config_hub/finetune/mistral-7b-v0.2/lora.yaml index f56e34c525..eb4228a57e 100644 --- a/config_hub/finetune/mistral-7b-v0.2/lora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml index b648b24d72..e36e5d6925 100644 --- a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b/lora.yaml b/config_hub/finetune/mistral-7b/lora.yaml index e991ec424e..10e13d935f 100644 --- a/config_hub/finetune/mistral-7b/lora.yaml +++ b/config_hub/finetune/mistral-7b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/mistral-7b/qlora.yaml b/config_hub/finetune/mistral-7b/qlora.yaml index e43b745bb8..a985c6770e 100644 --- a/config_hub/finetune/mistral-7b/qlora.yaml +++ b/config_hub/finetune/mistral-7b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml index 5b302a48ac..9509152c08 100644 --- a/config_hub/finetune/phi-2/full.yaml +++ b/config_hub/finetune/phi-2/full.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -96,3 +84,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/lora.yaml b/config_hub/finetune/phi-2/lora.yaml index 2571bc02d0..81da9b0826 100644 --- a/config_hub/finetune/phi-2/lora.yaml +++ b/config_hub/finetune/phi-2/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/phi-2/qlora.yaml b/config_hub/finetune/phi-2/qlora.yaml index d48d910939..0c6e91df88 100644 --- a/config_hub/finetune/phi-2/qlora.yaml +++ b/config_hub/finetune/phi-2/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml index c196fcc017..ecf3d1a25d 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml index 6e52ea2175..e85dbfd4a4 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml @@ -84,18 +84,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -122,3 +110,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml index ebd2f098eb..2980a7013e 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml @@ -86,18 +86,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -124,3 +112,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/full.yaml b/config_hub/finetune/tiny-llama/full.yaml index fe1d1ef99d..e85f928337 100644 --- a/config_hub/finetune/tiny-llama/full.yaml +++ b/config_hub/finetune/tiny-llama/full.yaml @@ -55,18 +55,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -93,3 +81,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml index c42ff28ff3..f140a8d26d 100644 --- a/config_hub/finetune/tiny-llama/lora.yaml +++ b/config_hub/finetune/tiny-llama/lora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. 
(type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/finetune/tiny-llama/qlora.yaml b/config_hub/finetune/tiny-llama/qlora.yaml index 7e80e4d0ca..bcf8112a01 100644 --- a/config_hub/finetune/tiny-llama/qlora.yaml +++ b/config_hub/finetune/tiny-llama/qlora.yaml @@ -85,18 +85,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) tie_embeddings: - # (type: float, default: 0.0003) - learning_rate: 0.0002 - - # (type: float, default: 0.02) - weight_decay: 0.0 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: null) max_norm: @@ -123,3 +111,21 @@ logger_name: csv # The random seed to use for reproducibility. (type: int, default: 1337) seed: 1337 + +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0002 + + # (type: float, default: 0.01) + weight_decay: 0.0 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 diff --git a/config_hub/pretrain/debug.yaml b/config_hub/pretrain/debug.yaml index e89dda3cc9..ab848aa341 100644 --- a/config_hub/pretrain/debug.yaml +++ b/config_hub/pretrain/debug.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: False) tie_embeddings: - # (type: float, default: 0.0004) - learning_rate: 6e-4 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -91,6 +79,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 6e-4 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinyllama.yaml b/config_hub/pretrain/tinyllama.yaml index e2418a5b17..5dc8bf64b3 100644 --- a/config_hub/pretrain/tinyllama.yaml +++ b/config_hub/pretrain/tinyllama.yaml @@ -58,18 +58,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) tie_embeddings: - # (type: float, default: 0.0004) - learning_rate: 4e-4 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -91,6 +79,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 4e-4 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. 
(type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinystories.yaml b/config_hub/pretrain/tinystories.yaml index 8ed53a09d7..ba2b03d6e2 100644 --- a/config_hub/pretrain/tinystories.yaml +++ b/config_hub/pretrain/tinystories.yaml @@ -74,18 +74,6 @@ train: # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) tie_embeddings: true - # (type: float, default: 0.0004) - learning_rate: 0.0005 - - # (type: float, default: 0.1) - weight_decay: 0.1 - - # (type: float, default: 0.9) - beta1: 0.9 - - # (type: float, default: 0.95) - beta2: 0.95 - # (type: Optional[float], default: 1.0) max_norm: 1.0 @@ -107,6 +95,24 @@ eval: # Whether to evaluate on the validation set at the beginning of the training initial_validation: false +# Optimizer-related arguments +optimizer: + + class_path: torch.optim.AdamW + + init_args: + + # (type: float, default: 0.001) + lr: 0.0005 + + # (type: float, default: 0.01) + weight_decay: 0.1 + + # (type: tuple, default: (0.9,0.999)) + betas: + - 0.9 + - 0.95 + # How many devices/GPUs to use. Uses all GPUs by default. 
(type: Union[int, str], default: auto) devices: auto diff --git a/extensions/thunder/pretrain.py b/extensions/thunder/pretrain.py index 6aa77a745f..757a3ecb99 100644 --- a/extensions/thunder/pretrain.py +++ b/extensions/thunder/pretrain.py @@ -8,7 +8,7 @@ from datetime import timedelta from functools import partial from pathlib import Path -from typing import Any, Callable, Optional, Tuple, Union, List +from typing import Any, Callable, Optional, Tuple, Union, List, Dict import lightning as L import torch @@ -30,6 +30,7 @@ choose_logger, chunked_cross_entropy, copy_config_files, + instantiate_torch_optimizer, num_parameters, parse_devices, reset_parameters, @@ -55,16 +56,13 @@ def setup( global_batch_size=512, micro_batch_size=4, max_tokens=int(3e12), # 3 trillion - learning_rate=4e-4, - weight_decay=1e-1, - beta1=0.9, - beta2=0.95, max_norm=1.0, min_lr=4e-5, lr_warmup_steps=2000, tie_embeddings=False, ), eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", devices: Union[int, str] = "auto", tokenizer_dir: Optional[Path] = None, logger_name: Literal["wandb", "tensorboard", "csv"] = "tensorboard", @@ -89,6 +87,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. devices: How many devices/GPUs to use. Uses all GPUs by default. tokenizer_dir: Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data module require this. 
@@ -157,6 +156,7 @@ def setup( tokenizer, train, eval, + optimizer, compiler, ) @@ -174,6 +174,7 @@ def main( tokenizer: Optional[Tokenizer], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], compiler: Optional[Literal["thunder", "torch"]], ) -> None: validate_args(train, eval, initial_checkpoint_dir, resume) @@ -201,13 +202,7 @@ def main( if compiler == "thunder": # avoid `Tensor.register_hook` which is unsupported model._register_backward_hook = lambda *_: None - optimizer = torch.optim.AdamW( - model.parameters(), - lr=train.learning_rate, - weight_decay=train.weight_decay, - betas=(train.beta1, train.beta2), - fused=True, - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) optimizer = fabric.setup_optimizers(optimizer) train_dataloader, val_dataloader = get_dataloaders(fabric, data, tokenizer, train, model.max_seq_length) @@ -231,7 +226,7 @@ def main( fabric.load(resume, state) train_time = time.perf_counter() - fit(fabric, devices, state, train_dataloader, val_dataloader, out_dir, tokenizer_dir, train, eval) + fit(fabric, devices, state, train_dataloader, val_dataloader, out_dir, tokenizer_dir, train, eval, optimizer) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") # Save final checkpoint @@ -251,6 +246,7 @@ def fit( tokenizer_dir: Optional[Path], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: model = state["model"] optimizer = state["optimizer"] @@ -288,7 +284,7 @@ def fit( break # determine and set the learning rate for this iteration - lr = get_lr(train.learning_rate, state["iter_num"], warmup_iters, max_iters, train.min_lr) + lr = get_lr(optimizer.param_groups[0]["lr"], state["iter_num"], warmup_iters, max_iters, train.min_lr) for param_group in optimizer.param_groups: param_group["lr"] = lr diff --git a/litgpt/__main__.py b/litgpt/__main__.py index 821c1f5801..2324ce0b30 100644 --- a/litgpt/__main__.py +++ b/litgpt/__main__.py @@ -118,7 +118,10 @@ def main() -> 
None: if k == "help": continue subsubcommand_parser = _new_parser() - subsubcommand_parser.add_function_arguments(v["fn"]) + if subcommand in ("finetune", "pretrain"): + subsubcommand_parser.add_subclass_arguments(torch.optim.Optimizer, "optimizer", instantiate=False, fail_untyped=False, skip={"params"}) + subsubcommand_parser.set_defaults({"optimizer": "AdamW"}) + subsubcommand_parser.add_function_arguments(v["fn"], skip={"optimizer"}) subcommands.add_subcommand(k, subsubcommand_parser, help=v["help"]) args = root_parser.parse_args() @@ -140,6 +143,10 @@ def main() -> None: torch.set_float32_matmul_precision("high") + # Dictionary unpacking on the jsonargparse namespace seems to flatten inner namespaces; it is unclear whether that is a bug or intended behavior, + # but we can simply convert the namespace to a dict at this point. + kwargs = kwargs.as_dict() + fn(**kwargs) diff --git a/litgpt/args.py b/litgpt/args.py index 7e277fe9e6..e3bac05ef2 100644 --- a/litgpt/args.py +++ b/litgpt/args.py @@ -33,10 +33,6 @@ class TrainArgs: """Whether to tie the embedding weights with the language modeling head weights""" # Optimization args - learning_rate: float = 1e-3 - weight_decay: float = 0.02 - beta1: float = 0.9 - beta2: float = 0.95 max_norm: Optional[float] = None min_lr: float = 6e-5 diff --git a/litgpt/finetune/adapter.py b/litgpt/finetune/adapter.py index 2ec71784e7..3f1030a229 100644 --- a/litgpt/finetune/adapter.py +++ b/litgpt/finetune/adapter.py @@ -29,6 +29,8 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, load_checkpoint, num_parameters, parse_devices, @@ -50,10 +52,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=1e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -69,6 +71,7 @@ def setup(
data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. """ @@ -109,7 +112,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -122,6 +125,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -146,14 +150,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/finetune/adapter_v2.py b/litgpt/finetune/adapter_v2.py index 86526a58e5..785668939e 100644 --- a/litgpt/finetune/adapter_v2.py +++ b/litgpt/finetune/adapter_v2.py @@ -29,6 +29,8 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, load_checkpoint, num_parameters, parse_devices, @@ -50,10 +52,10 @@ def 
setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=1e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -69,6 +71,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. """ @@ -109,7 +112,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -122,6 +125,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -146,14 +150,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/finetune/full.py 
b/litgpt/finetune/full.py index bba6d0ea61..cf32ae501d 100644 --- a/litgpt/finetune/full.py +++ b/litgpt/finetune/full.py @@ -28,6 +28,7 @@ get_default_supported_precision, load_checkpoint, init_out_dir, + instantiate_torch_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -48,10 +49,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=3e-3, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -68,12 +69,14 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. 
""" - pprint(locals()) + data = Alpaca() if data is None else data + devices = parse_devices(devices) out_dir = init_out_dir(out_dir) @@ -97,7 +100,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) - fabric.launch(main, devices, resume, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, resume, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -111,6 +114,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -131,9 +135,8 @@ def main( fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") model = fabric.setup(model) - optimizer = torch.optim.AdamW( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0} @@ -371,4 +374,3 @@ def validate_args(train: TrainArgs, eval: EvalArgs) -> None: issues.append(f"{__file__} requires either epochs or max_steps to be set. 
This is set in {train}") if issues: raise ValueError("\n".join(issues)) - diff --git a/litgpt/finetune/lora.py b/litgpt/finetune/lora.py index a2c3ef07b2..5f5e12dcf9 100644 --- a/litgpt/finetune/lora.py +++ b/litgpt/finetune/lora.py @@ -31,6 +31,8 @@ get_default_supported_precision, load_checkpoint, init_out_dir, + instantiate_torch_optimizer, + instantiate_bnb_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -60,10 +62,10 @@ def setup( micro_batch_size=1, lr_warmup_steps=100, epochs=5, - learning_rate=3e-4, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -88,6 +90,7 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. logger_name: The name of the logger to send metrics to. seed: The random seed to use for reproducibility. 
""" @@ -139,7 +142,7 @@ def setup( strategy = "auto" fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval) + fabric.launch(main, devices, seed, config, data, checkpoint_dir, out_dir, train, eval, optimizer) def main( @@ -152,6 +155,7 @@ def main( out_dir: Path, train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval) @@ -176,14 +180,10 @@ def main( model = fabric.setup_module(model) if isinstance(fabric.strategy.precision, BitsandbytesPrecision): - import bitsandbytes as bnb - - optimizer_cls = bnb.optim.PagedAdamW + optimizer = instantiate_bnb_optimizer(optimizer, model.parameters()) else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) - ) + optimizer = instantiate_torch_optimizer(optimizer, model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py index 56f4ac9b3c..198a6673c9 100644 --- a/litgpt/pretrain.py +++ b/litgpt/pretrain.py @@ -6,7 +6,7 @@ from datetime import timedelta from functools import partial from pathlib import Path -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Dict import lightning as L import torch @@ -23,7 +23,6 @@ from litgpt.data import DataModule, TinyLlama from litgpt.model import GPT, Block, CausalSelfAttention, Config, LLaMAMLP from litgpt.utils import ( - CLI, CycleIterator, capture_hparams, choose_logger, @@ -31,6 +30,7 @@ copy_config_files, get_default_supported_precision, init_out_dir, + instantiate_torch_optimizer, num_parameters, parse_devices, reset_parameters, @@ -53,16 +53,13 @@ def setup( global_batch_size=512, 
micro_batch_size=4, max_tokens=int(3e12), # 3 trillion - learning_rate=4e-4, - weight_decay=1e-1, - beta1=0.9, - beta2=0.95, max_norm=1.0, min_lr=4e-5, lr_warmup_steps=2000, tie_embeddings=False, ), eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), + optimizer: Union[str, Dict] = "AdamW", devices: Union[int, str] = "auto", tokenizer_dir: Optional[Path] = None, logger_name: Literal["wandb", "tensorboard", "csv"] = "tensorboard", @@ -85,6 +82,8 @@ def setup( data: Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. + optimizer: An optimizer name (such as "AdamW") or config. + devices: How many devices/GPUs to use. Uses all GPUs by default. tokenizer_dir: Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data module require this. @@ -133,6 +132,7 @@ def setup( tokenizer, train, eval, + optimizer, ) @@ -149,6 +149,7 @@ def main( tokenizer: Optional[Tokenizer], train: TrainArgs, eval: EvalArgs, + optimizer: Union[str, Dict], ) -> None: validate_args(train, eval, initial_checkpoint_dir, resume) @@ -174,13 +175,8 @@ def main( model = torch.compile(model) model = fabric.setup(model) - optimizer = torch.optim.AdamW( - model.parameters(), - lr=train.learning_rate, - weight_decay=train.weight_decay, - betas=(train.beta1, train.beta2), - fused=fabric.device.type == "cuda", - ) + extra_kwargs = {"fused": fabric.device.type == "cuda"} + optimizer = instantiate_torch_optimizer(optimizer, model.parameters(), **extra_kwargs) optimizer = fabric.setup_optimizers(optimizer) train_dataloader, val_dataloader = get_dataloaders(fabric, data, tokenizer, train, model.max_seq_length) @@ -266,7 +262,7 @@ def fit( break # determine and set the learning rate for this iteration - lr = get_lr(train.learning_rate, state["iter_num"], warmup_iters, max_iters, 
train.min_lr) + lr = get_lr(optimizer.param_groups[0]["lr"], state["iter_num"], warmup_iters, max_iters, train.min_lr) for param_group in optimizer.param_groups: param_group["lr"] = lr @@ -442,4 +438,3 @@ def validate_args(train: TrainArgs, eval: EvalArgs, initial_checkpoint_dir, resu issues.append("Can't provide both `--resume` and `--initial_checkpoint_dir`. Choose one.") if issues: raise ValueError("\n".join(issues)) - diff --git a/litgpt/utils.py b/litgpt/utils.py index 18aea56d64..9225af8911 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -21,6 +21,7 @@ from lightning.fabric.strategies import FSDPStrategy from lightning.fabric.utilities.load import _lazy_load as lazy_load from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.cli import instantiate_class from torch.serialization import normalize_storage_type from typing_extensions import Self @@ -486,3 +487,32 @@ def choose_logger( if logger_name == "wandb": return WandbLogger(project=name, resume=resume, **kwargs) raise ValueError(f"`--logger_name={logger_name}` is not a valid option. 
Choose from 'csv', 'tensorboard', 'wandb'.") + + +def get_argument_names(cls): + sig = inspect.signature(cls.__init__) + return {name for name, param in sig.parameters.items() + if param.kind in [inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY]} + + +def instantiate_bnb_optimizer(optimizer, model_parameters): + if (isinstance(optimizer, str) and "AdamW" not in optimizer) or (isinstance(optimizer, dict) and "AdamW" not in optimizer.get("class_path", "")): + raise ValueError("The chosen quantization format only supports the AdamW optimizer.") + + import bitsandbytes as bnb + if isinstance(optimizer, str): + optimizer = bnb.optim.PagedAdamW(model_parameters) + else: + optim_args = get_argument_names(bnb.optim.PagedAdamW) + allowed_kwargs = {key: optimizer["init_args"][key] for key in optim_args & optimizer["init_args"].keys()} + optimizer = bnb.optim.PagedAdamW(model_parameters, **allowed_kwargs) + return optimizer + + +def instantiate_torch_optimizer(optimizer, model_parameters, **kwargs): + if isinstance(optimizer, str): + optimizer_cls = getattr(torch.optim, optimizer) + optimizer = optimizer_cls(model_parameters, **kwargs) + else: + optimizer = instantiate_class(model_parameters, optimizer, **kwargs) + return optimizer diff --git a/tests/test_config_hub.py b/tests/test_config_hub.py index 4ad634ca9b..8987cc0071 100644 --- a/tests/test_config_hub.py +++ b/tests/test_config_hub.py @@ -16,7 +16,7 @@ ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), ( "litgpt/pretrain.py", - "https://raw.githubusercontent.com/Lightning-AI/litgpt/main/config_hub/pretrain/tinystories.yaml", + "https://raw.githubusercontent.com/Lightning-AI/litgpt/eb6ec386a9ffc8214f6435cb8345789b3b31a267/config_hub/pretrain/tinystories.yaml", ), ] diff --git a/tests/test_thunder_pretrain.py b/tests/test_thunder_pretrain.py index 30f9d71afb..e941ad7949 100644 --- a/tests/test_thunder_pretrain.py +++ b/tests/test_thunder_pretrain.py @@ -37,6 +37,7 @@ def test_pretrain(tmp_path, 
monkeypatch): out_dir=out_dir, train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), eval=EvalArgs(interval=1, max_iters=1), + optimizer="AdamW", ) out_dir_contents = set(os.listdir(out_dir)) diff --git a/tests/test_utils.py b/tests/test_utils.py index 554929c77f..ebb018ef5a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -16,6 +16,7 @@ from lightning.fabric.loggers import CSVLogger, TensorBoardLogger from lightning.fabric.plugins import BitsandbytesPrecision from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.cli import instantiate_class from lightning_utilities.core.imports import RequirementCache from litgpt import GPT @@ -29,8 +30,11 @@ chunked_cross_entropy, copy_config_files, find_multiple, + get_argument_names, incremental_save, init_out_dir, + instantiate_bnb_optimizer, + instantiate_torch_optimizer, num_parameters, parse_devices, save_hyperparameters, @@ -306,4 +310,46 @@ def test_init_out_dir(tmp_path): with mock.patch.dict(os.environ, {"LIGHTNING_ARTIFACTS_DIR": "prefix"}): assert init_out_dir(relative_path) == Path("prefix") / relative_path - assert init_out_dir(absolute_path) == absolute_path \ No newline at end of file + assert init_out_dir(absolute_path) == absolute_path + + +@pytest.fixture +def model_parameters(): + return [torch.nn.Parameter(torch.randn(2, 2))] + + +def test_instantiate_bnb_optimizer_with_str(model_parameters): + import bitsandbytes as bnb + with mock.patch("litgpt.utils.get_argument_names", return_value={"lr", "eps", "weight_decay"}): + optimizer = instantiate_bnb_optimizer("AdamW", model_parameters) + assert isinstance(optimizer, bnb.optim.adamw.PagedAdamW) + + +def test_instantiate_bnb_optimizer_with_dict(model_parameters): + import bitsandbytes as bnb + optimizer_dict = {"class_path": "AdamW", "init_args": {"lr": 0.01}} + with mock.patch("litgpt.utils.get_argument_names", return_value={"lr", "eps", "weight_decay"}): + optimizer = 
instantiate_bnb_optimizer(optimizer_dict, model_parameters) + assert isinstance(optimizer, bnb.optim.adamw.PagedAdamW) + assert optimizer.param_groups[0]["lr"] == 0.01 + + +def test_instantiate_bnb_optimizer_with_invalid_str(model_parameters): + with pytest.raises(ValueError, match="only supports the AdamW"): + instantiate_bnb_optimizer("SGD", model_parameters) + + +def test_instantiate_torch_optimizer_with_str(model_parameters): + with mock.patch("litgpt.utils.instantiate_class") as mock_instantiate_class: + mock_instantiate_class.return_value = torch.optim.Adam(model_parameters, lr=0.01) + optimizer = instantiate_torch_optimizer("Adam", model_parameters, lr=0.01) + assert isinstance(optimizer, torch.optim.Adam) + assert optimizer.param_groups[0]["lr"] == 0.01 + + +def test_instantiate_torch_optimizer_with_class(model_parameters): + with mock.patch("litgpt.utils.instantiate_class") as mock_instantiate_class: + mock_instantiate_class.return_value = torch.optim.Adam(model_parameters, lr=0.02) + optimizer = instantiate_torch_optimizer(torch.optim.Adam, model_parameters, lr=0.02) + assert isinstance(optimizer, torch.optim.Adam) + assert optimizer.param_groups[0]["lr"] == 0.02 diff --git a/tutorials/pretrain.md b/tutorials/pretrain.md index ce8f92b0e7..9b83d93caa 100644 --- a/tutorials/pretrain.md +++ b/tutorials/pretrain.md @@ -79,8 +79,8 @@ litgpt pretrain \ --tokenizer_dir checkpoints/EleutherAI/pythia-14m \ --data TextFiles \ --data.train_data_path custom_pretraining_data \ - --train.learning_rate 0.005 \ --train.lr_warmup_steps=200 + --optimizer.lr 0.005 ``` @@ -117,8 +117,8 @@ litgpt pretrain \ --out_dir new_phi-2_checkpoint \ --data TextFiles \ --data.train_data_path custom_pretraining_data \ - --train.learning_rate 0.005 \ --train.lr_warmup_steps=200 + --optimizer.lr 0.005 ```