Fix parameter naming issue with Mistral 7B v0.3 #1480

Closed · wants to merge 32 commits

Changes from all commits (32 commits)
141c8bf · OptimizerArgs (#1409) · rasbt, May 23, 2024
2174054 · Pin back to main · carmocca, May 23, 2024
daffef0 · Fix optimizer init with fused=True (#1434) · carmocca, May 23, 2024
66a797a · Fix learning rate calculation in pretrain (#1435) · rasbt, May 23, 2024
dbf7542 · Align readme (#1438) · rasbt, May 24, 2024
1754a2b · Pin litdata (#1440) · rasbt, May 24, 2024
19a0d7a · Fix README.md alignment (#1439) · rasbt, May 24, 2024
221b7ef · Update README.md for one last time (#1442) · rasbt, May 24, 2024
f6654e8 · A more centered look (#1449) · rasbt, May 28, 2024
3fa17fb · New CLI (#1437) · rasbt, May 31, 2024
916775c · Update error message (#1453) · rasbt, May 31, 2024
339cf43 · Explain how to list all available models (#1455) · rasbt, Jun 1, 2024
798d725 · Detect tensor cores (#1456) · rasbt, Jun 1, 2024
e567dbe · Check checkpoint_dir and add `checkpoints` to path (#1454) · rasbt, Jun 4, 2024
fa88952 · Add MicroLlama training support (#1457) · keeeeenw, Jun 4, 2024
0f3bca7 · Streaming for serving with chat's generate function (#1426) · rasbt, Jun 4, 2024
8c7df82 · Fix sequence length bug (#1462) · rasbt, Jun 5, 2024
3e4fb84 · Add `lr_warmup_steps`, `max_steps` values validation (#1460) · shenxiangzhuang, Jun 5, 2024
fe443ba · Fix issue where path in merge_lora is overwritten (#1465) · rasbt, Jun 6, 2024
9538d6a · Option to skip expensive final validation (#1372) · rasbt, Jun 6, 2024
d657908 · Allow batch size "auto" setting in evaluate (#1469) · rasbt, Jun 7, 2024
7be2851 · Warn users when there is a bnb mismatch (#1468) · rasbt, Jun 7, 2024
67e9164 · Allow batch argument with batch recomputation (#1470) · rasbt, Jun 7, 2024
0bb34ab · LitGPT Python API draft (#1459) · rasbt, Jun 7, 2024
8ca46d2 · Bump version for PyPI release (#1476) · rasbt, Jun 10, 2024
d2ba385 · Update download_model_weights.md · rasbt, Jun 11, 2024
3594142 · bumb version to 0.4.1.dev0 · rasbt, Jun 11, 2024
ee9108f · Fix typos in Download Model Weights documentation (#1477) · rasbt, Jun 11, 2024
97ef696 · Merge remote-tracking branch 'upstream/main' into mistral-v0.3 · davmacario, Jun 12, 2024
bbe4cf4 · fix: download safetensors weight mapping · davmacario, Jun 12, 2024
4f5d7fd · fix: use safetensors weight mapping · davmacario, Jun 12, 2024
42670fe · fix: correct extension update in file name · davmacario, Jun 12, 2024
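
The last three commits (bbe4cf4, 4f5d7fd, 42670fe) carry the actual fix: Mistral 7B v0.3 publishes only safetensors weights on the Hugging Face Hub, so the download step has to read the weight mapping from `model.safetensors.index.json` instead of `pytorch_model.bin.index.json`, and then adjust the file extensions it derives from that mapping. The snippet below is only a hedged illustration of that fallback idea, not code from this PR; the function name and control flow are assumptions.

```python
import json

from huggingface_hub import hf_hub_download


def load_weight_map(repo_id: str) -> tuple[dict, bool]:
    """Illustrative fallback: prefer the .bin index, else the safetensors index."""
    try:
        # Repos that ship PyTorch .bin shards publish this index file.
        index_path = hf_hub_download(repo_id, "pytorch_model.bin.index.json")
        uses_safetensors = False
    except Exception:
        # Repos such as mistralai/Mistral-7B-v0.3 publish only safetensors,
        # so fall back to the safetensors weight mapping.
        index_path = hf_hub_download(repo_id, "model.safetensors.index.json")
        uses_safetensors = True

    with open(index_path) as f:
        # Maps each parameter name to the shard file that contains it.
        weight_map = json.load(f)["weight_map"]
    return weight_map, uses_safetensors
```

With the mapping in hand, the per-shard filenames can then be normalized (for example `.safetensors` to `.bin`) before conversion, which appears to be what the final commit message ("correct extension update in file name") refers to.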
5 changes: 3 additions & 2 deletions .gitignore
@@ -18,5 +18,6 @@ wandb
events.out.tfevents*

# test artifacts from tests/test_readme.py
tests/custom_finetuning_dataset.json
tests/custom_texts
**/custom_finetuning_dataset.json
client.py
**/custom_texts/
107 changes: 55 additions & 52 deletions README.md
@@ -7,7 +7,11 @@

Uses the latest state-of-the-art techniques:

✅ flash attention     ✅ fp4/8/16/32     ✅ LoRA, QLoRA, Adapter (v1, v2)     ✅ FSDP     ✅ 1-1000+ GPUs/TPUs
<pre>
✅ flash attention ✅ fp4/8/16/32 ✅ LoRA, QLoRA, Adapter
✅ FSDP ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs
</pre>


---

@@ -69,30 +73,34 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials

| Model | Model size | Author | Reference |
|----|----|----|----|
| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
| Danube2 | 1.8B | H2O.ai | [H2O.ai](https://h2o.ai/platform/danube-1-8b/) |
| Dolly | 3B, 7B, 12B | Databricks | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) |
| Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) |
| FreeWilly2 (Stable Beluga 2) | 70B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) |
| Function Calling Llama 2 | 7B | Trelis | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) |
| Gemma | 2B, 7B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
| LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
| Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
| Danube2 | 1.8B | H2O.ai | [H2O.ai](https://h2o.ai/platform/danube-1-8b/) |
| Dolly | 3B, 7B, 12B | Databricks | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) |
| Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) |
| FreeWilly2 (Stable Beluga 2) | 70B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) |
| Function Calling Llama 2 | 7B | Trelis | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) |
| Gemma | 2B, 7B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
| LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
| MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama)
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
| StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| TinyLlama | 1.1B | Zhang et al. | [Zhang et al. 2023](https://github.com/jzhang38/TinyLlama) |
| Vicuna | 7B, 13B, 33B | LMSYS | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/)
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
| StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
| TinyLlama | 1.1B | Zhang et al. | [Zhang et al. 2023](https://github.com/jzhang38/TinyLlama) |
| Vicuna | 7B, 13B, 33B | LMSYS | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/) |

**Tip**: You can list all available models by running the `litgpt download list` command.


</details>

@@ -138,7 +146,7 @@ litgpt serve meta-llama/Meta-Llama-3-8B-Instruct
&nbsp;

### Use an LLM for inference
Use LLMs for inference to test its chatting capabilities, run evaluations, or extract embeddings, etc...
Use LLMs for inference to test its chatting capabilities, run evaluations, or extract embeddings, etc.
Here's an example showing how to use the Phi-2 LLM.

<a target="_blank" href="https://lightning.ai/lightning-ai/studios/litgpt-chat">
@@ -148,12 +156,14 @@ Here's an example showing how to use the Phi-2 LLM.
&nbsp;

```bash
# 1) Download a pretrained model
litgpt download --repo_id microsoft/phi-2
# 1) List all available models in litgpt
litgpt download list

# 2) Chat with the model
litgpt chat \
--checkpoint_dir checkpoints/microsoft/phi-2
# 2) Download a pretrained model
litgpt download microsoft/phi-2

# 3) Chat with the model
litgpt chat microsoft/phi-2

>> Prompt: What do Llamas eat?
```
@@ -174,21 +184,19 @@ For more information on the different inference options, refer to the [inference

```bash
# 1) Download a pretrained model
litgpt download --repo_id microsoft/phi-2
litgpt download microsoft/phi-2

# 2) Finetune the model
curl -L https://huggingface.co/datasets/ksaw008/finance_alpaca/resolve/main/finance_alpaca.json -o my_custom_dataset.json

litgpt finetune \
--checkpoint_dir checkpoints/microsoft/phi-2 \
litgpt finetune microsoft/phi-2 \
--data JSON \
--data.json_path my_custom_dataset.json \
--data.val_split_fraction 0.1 \
--out_dir out/custom-model

# 3) Chat with the model
litgpt chat \
--checkpoint_dir out/custom-model/final
litgpt chat out/custom-model/final
```

&nbsp;
@@ -208,22 +216,19 @@ curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_text
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt

# 1) Download a tokenizer
litgpt download \
--repo_id EleutherAI/pythia-160m \
litgpt download EleutherAI/pythia-160m \
--tokenizer_only True

# 2) Pretrain the model
litgpt pretrain \
--model_name pythia-160m \
--tokenizer_dir checkpoints/EleutherAI/pythia-160m \
litgpt pretrain EleutherAI/pythia-160m \
--tokenizer_dir EleutherAI/pythia-160m \
--data TextFiles \
--data.train_data_path "custom_texts/" \
--train.max_tokens 10_000_000 \
--out_dir out/custom-model

# 3) Chat with the model
litgpt chat \
--checkpoint_dir out/custom-model/final
litgpt chat out/custom-model/final
```

&nbsp;
@@ -244,21 +249,19 @@ curl https://www.gutenberg.org/cache/epub/24440/pg24440.txt --output custom_text
curl https://www.gutenberg.org/cache/epub/26393/pg26393.txt --output custom_texts/book2.txt

# 1) Download a pretrained model
litgpt download --repo_id EleutherAI/pythia-160m
litgpt download EleutherAI/pythia-160m

# 2) Continue pretraining the model
litgpt pretrain \
--model_name pythia-160m \
--tokenizer_dir checkpoints/EleutherAI/pythia-160m \
--initial_checkpoint_dir checkpoints/EleutherAI/pythia-160m \
litgpt pretrain EleutherAI/pythia-160m \
--tokenizer_dir EleutherAI/pythia-160m \
--initial_checkpoint_dir EleutherAI/pythia-160m \
--data TextFiles \
--data.train_data_path "custom_texts/" \
--train.max_tokens 10_000_000 \
--out_dir out/custom-model

# 3) Chat with the model
litgpt chat \
--checkpoint_dir out/custom-model/final
litgpt chat out/custom-model/final
```

&nbsp;
@@ -274,11 +277,11 @@ Once you're ready to deploy a finetuned LLM, run this command:

```bash
# locate the checkpoint to your finetuned or pretrained model and call the `serve` command:
litgpt serve --checkpoint_dir path/to/your/checkpoint/microsoft/phi-2
litgpt serve microsoft/phi-2

# Alternative: if you haven't finetuned, download any checkpoint to deploy it:
litgpt download --repo_id microsoft/phi-2
litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2
litgpt download microsoft/phi-2
litgpt serve microsoft/phi-2
```

Test the server in a separate terminal and integrate the model API into your AI product:
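
The README section above ends by asking the reader to test the server from a second terminal; the snippet that follows that sentence is collapsed in this diff view. As a rough, hedged example (not taken from this PR), a request against a locally running `litgpt serve` instance might look like the following, assuming the default `127.0.0.1:8000` address and a `/predict` route that accepts a JSON `prompt` field:

```python
import requests

# Assumes `litgpt serve` is running locally with its default host and port and
# exposes a /predict endpoint that takes a JSON body with a "prompt" key.
response = requests.post(
    "http://127.0.0.1:8000/predict",
    json={"prompt": "Fix typos in the following sentence: Exampel input"},
)
response.raise_for_status()
print(response.json())  # the generated text is returned in the JSON payload
```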
33 changes: 21 additions & 12 deletions config_hub/finetune/falcon-7b/lora.yaml
@@ -84,18 +84,6 @@ train:
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

@@ -117,8 +105,29 @@ eval:
# Whether to evaluate on the validation set at the beginning of the training
initial_validation: false

# Whether to evaluate on the validation set at the end the training
final_validation: true

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337

# Optimizer-related arguments
optimizer:

class_path: torch.optim.AdamW

init_args:

# (type: float, default: 0.001)
lr: 0.0002

# (type: float, default: 0.01)
weight_decay: 0.0

# (type: tuple, default: (0.9,0.999))
betas:
- 0.9
- 0.95
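
Across the config files touched here, the flat `learning_rate` / `weight_decay` / `beta1` / `beta2` fields under `train:` are replaced by an `optimizer:` block in the `class_path` / `init_args` style, which is the convention jsonargparse uses for instantiating a class from a config file. As a hedged sketch of how such a block maps to an actual optimizer instance (assuming nothing about LitGPT's internals):

```python
import importlib

import torch

# Mirrors the YAML above: class_path names the optimizer class,
# init_args holds its constructor arguments.
optimizer_cfg = {
    "class_path": "torch.optim.AdamW",
    "init_args": {"lr": 0.0002, "weight_decay": 0.0, "betas": (0.9, 0.95)},
}


def build_optimizer(params, cfg):
    module_name, _, class_name = cfg["class_path"].rpartition(".")
    optimizer_cls = getattr(importlib.import_module(module_name), class_name)
    return optimizer_cls(params, **cfg["init_args"])


# Example usage with a throwaway model.
model = torch.nn.Linear(4, 4)
optimizer = build_optimizer(model.parameters(), optimizer_cfg)
```

The same pattern repeats in the qlora.yaml and gemma-2b configs below, so any `torch.optim` optimizer could in principle be swapped in by changing `class_path` and its `init_args`.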
33 changes: 21 additions & 12 deletions config_hub/finetune/falcon-7b/qlora.yaml
@@ -86,18 +86,6 @@ train:
# Whether to tie the embedding weights with the language modeling head weights (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

@@ -119,8 +107,29 @@ eval:
# Whether to evaluate on the validation set at the beginning of the training
initial_validation: false

# Whether to evaluate on the validation set at the end the training
final_validation: true

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337

# Optimizer-related arguments
optimizer:

class_path: torch.optim.AdamW

init_args:

# (type: float, default: 0.001)
lr: 0.0002

# (type: float, default: 0.01)
weight_decay: 0.0

# (type: tuple, default: (0.9,0.999))
betas:
- 0.9
- 0.95
33 changes: 21 additions & 12 deletions config_hub/finetune/gemma-2b/full.yaml
@@ -55,18 +55,6 @@ train:
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.0

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

@@ -88,8 +76,29 @@ eval:
# Whether to evaluate on the validation set at the beginning of the training
initial_validation: false

# Whether to evaluate on the validation set at the end the training
final_validation: true

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337

# Optimizer-related arguments
optimizer:

class_path: torch.optim.AdamW

init_args:

# (type: float, default: 0.001)
lr: 0.0002

# (type: float, default: 0.01)
weight_decay: 0.0

# (type: tuple, default: (0.9,0.999))
betas:
- 0.9
- 0.95
33 changes: 21 additions & 12 deletions config_hub/finetune/gemma-2b/lora.yaml
@@ -85,18 +85,6 @@ train:
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
tie_embeddings:

# (type: float, default: 0.0003)
learning_rate: 0.0002

# (type: float, default: 0.02)
weight_decay: 0.2

# (type: float, default: 0.9)
beta1: 0.9

# (type: float, default: 0.95)
beta2: 0.95

# (type: Optional[float], default: null)
max_norm:

@@ -118,8 +106,29 @@ eval:
# Whether to evaluate on the validation set at the beginning of the training
initial_validation: false

# Whether to evaluate on the validation set at the end the training
final_validation: true

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337

# Optimizer-related arguments
optimizer:

class_path: torch.optim.AdamW

init_args:

# (type: float, default: 0.001)
lr: 0.0002

# (type: float, default: 0.01)
weight_decay: 0.0

# (type: tuple, default: (0.9,0.999))
betas:
- 0.9
- 0.95