diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index c31a862380..aac3b4e100 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -27,7 +27,7 @@ For more information, see the [Dealing with out-of-memory (OOM) errors](../../tu | | | | | | | | | | | | phi-2/lora.yaml | 2B | Alpaca 2k | 1 | 0.832 | 13.98 GB | 512 | 4 | bfloat16 | 3.82 min (1xA10G) | | phi-2/qlora.yaml | 2B | Alpaca 2k | 1 | 0.846 | 14.27 GB | 512 | 4 | bfloat16 | 4.55 min (1xA10G) | -| phi-2/full.yaml | 2B | Alpaca 2k | 1 | 0.937 | 14.44 GB | 512 | 4 | bfloat16 | 13.00 min (1xA10G) | +| phi-2/full.yaml | 2B | Alpaca 2k | 1 | 0.937 | 14.44 GB | 512 | 4 | bfloat16 | 13.00 min (2xA10G) | | | | | | | | | | | | | stablelm-base-alpha-3b/lora.yaml | 7B | Alpaca 2k | 4 | 1.367 | 8.58 GB | 512 | 2 | bfloat16 | 13.02 min (1xA10G) | | stablelm-base-alpha-3b/qlora.yaml | 7B | Alpaca 2k | 4 | 1.392 | 5.24 GB | 512 | 2 | bfloat16 | 25.71 min (1xA10G) | diff --git a/litgpt/finetune/adapter.py b/litgpt/finetune/adapter.py index 304ea6bd3a..e60aadd312 100644 --- a/litgpt/finetune/adapter.py +++ b/litgpt/finetune/adapter.py @@ -17,7 +17,7 @@ from litgpt.adapter import GPT, Block, Config, adapter_filter, mark_only_adapter_as_trainable from litgpt.args import EvalArgs, TrainArgs -from litgpt.data import Alpaca, DataModule +from litgpt.data import Alpaca2k, DataModule from litgpt.generate.base import generate from litgpt.prompts import save_prompt_style from litgpt.tokenizer import Tokenizer @@ -46,14 +46,14 @@ def setup( train: TrainArgs = TrainArgs( save_interval=1000, log_interval=1, - global_batch_size=16, + global_batch_size=8, micro_batch_size=1, - lr_warmup_steps=100, - epochs=5, - learning_rate=1e-3, + lr_warmup_steps=10, + epochs=1, + learning_rate=0.002, max_seq_length=None, ), - eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + eval: EvalArgs = EvalArgs(interval=50, max_new_tokens=100, max_iters=100), logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -65,7 +65,7 @@ def setup( precision: The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". quantize: If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. devices: How many devices/GPUs to use. - data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. + data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca2k``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. logger_name: The name of the logger to send metrics to. 
@@ -73,7 +73,7 @@ def setup( """ pprint(locals()) - data = Alpaca() if data is None else data + data = Alpaca2k() if data is None else data devices = parse_devices(devices) check_valid_checkpoint_dir(checkpoint_dir) diff --git a/litgpt/finetune/adapter_v2.py b/litgpt/finetune/adapter_v2.py index d925f99ce1..894975392b 100644 --- a/litgpt/finetune/adapter_v2.py +++ b/litgpt/finetune/adapter_v2.py @@ -17,7 +17,7 @@ from litgpt.adapter_v2 import GPT, Block, Config, adapter_filter, mark_only_adapter_v2_as_trainable from litgpt.args import EvalArgs, TrainArgs -from litgpt.data import Alpaca, DataModule +from litgpt.data import Alpaca2k, DataModule from litgpt.generate.base import generate from litgpt.prompts import save_prompt_style from litgpt.tokenizer import Tokenizer @@ -46,14 +46,14 @@ def setup( train: TrainArgs = TrainArgs( save_interval=1000, log_interval=1, - global_batch_size=16, + global_batch_size=8, micro_batch_size=1, - lr_warmup_steps=100, - epochs=5, - learning_rate=1e-3, + lr_warmup_steps=10, + epochs=1, + learning_rate=0.002, max_seq_length=None, ), - eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), + eval: EvalArgs = EvalArgs(interval=50, max_new_tokens=100, max_iters=100), logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -65,7 +65,7 @@ def setup( precision: The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". quantize: If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. devices: How many devices/GPUs to use. - data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. + data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca2k``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. logger_name: The name of the logger to send metrics to. @@ -73,7 +73,7 @@ def setup( """ pprint(locals()) - data = Alpaca() if data is None else data + data = Alpaca2k() if data is None else data devices = parse_devices(devices) check_valid_checkpoint_dir(checkpoint_dir) diff --git a/litgpt/finetune/full.py b/litgpt/finetune/full.py index fdcd6bff1e..4aa5f68068 100644 --- a/litgpt/finetune/full.py +++ b/litgpt/finetune/full.py @@ -14,7 +14,7 @@ from torchmetrics import RunningMean from litgpt.args import EvalArgs, TrainArgs -from litgpt.data import Alpaca, DataModule +from litgpt.data import Alpaca2k, DataModule from litgpt.generate.base import generate from litgpt.model import GPT, Block, Config from litgpt.prompts import save_prompt_style @@ -46,12 +46,12 @@ def setup( log_interval=1, global_batch_size=16, micro_batch_size=1, - lr_warmup_steps=100, - epochs=5, - learning_rate=3e-3, + lr_warmup_steps=1000, + epochs=1, + learning_rate=0.0002, max_seq_length=None, ), - eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), + eval: EvalArgs = EvalArgs(interval=50, max_new_tokens=100, max_iters=100), logger_name: Literal["wandb", "tensorboard", "csv"] = "csv", seed: int = 1337, ) -> None: @@ -64,7 +64,7 @@ def setup( devices: How many devices/GPUs to use resume: Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume from the latest checkpoint in ``out_dir``. - data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. + data: Data-related arguments. 
If not provided, the default is ``litgpt.data.Alpaca2k``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. logger_name: The name of the logger to send metrics to. @@ -72,7 +72,7 @@ def setup( """ pprint(locals()) - data = Alpaca() if data is None else data + data = Alpaca2k() if data is None else data devices = parse_devices(devices) check_valid_checkpoint_dir(checkpoint_dir) diff --git a/litgpt/finetune/lora.py b/litgpt/finetune/lora.py index 25ae0df839..21ccbb21ab 100644 --- a/litgpt/finetune/lora.py +++ b/litgpt/finetune/lora.py @@ -16,7 +16,7 @@ from torchmetrics import RunningMean from litgpt.args import EvalArgs, TrainArgs -from litgpt.data import Alpaca, DataModule +from litgpt.data import Alpaca2k, DataModule from litgpt.generate.base import generate from litgpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable from litgpt.prompts import save_prompt_style @@ -43,7 +43,7 @@ def setup( precision: Optional[str] = None, quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, devices: Union[int, str] = 1, - lora_r: int = 8, + lora_r: int = 32, lora_alpha: int = 16, lora_dropout: float = 0.05, lora_query: bool = True, @@ -56,11 +56,11 @@ def setup( train: TrainArgs = TrainArgs( save_interval=1000, log_interval=1, - global_batch_size=16, + global_batch_size=8, micro_batch_size=1, lr_warmup_steps=100, - epochs=5, - learning_rate=3e-4, + epochs=4, + learning_rate=0.0002, max_seq_length=None, ), eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), @@ -84,7 +84,7 @@ def setup( lora_projection: Whether to apply LoRA to the output projection in the attention block. lora_mlp: Whether to apply LoRA to the weights of the MLP in the attention block. lora_head: Whether to apply LoRA to output head in GPT. - data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. + data: Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca2k``. train: Training-related arguments. See ``litgpt.args.TrainArgs`` for details. eval: Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details. logger_name: The name of the logger to send metrics to. @@ -92,7 +92,7 @@ def setup( """ pprint(locals()) - data = Alpaca() if data is None else data + data = Alpaca2k() if data is None else data devices = parse_devices(devices) check_valid_checkpoint_dir(checkpoint_dir) diff --git a/litgpt/utils.py b/litgpt/utils.py index fb6a86c107..07a88bbd3a 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -305,7 +305,7 @@ def get_default_supported_precision(training: bool) -> str: if MPSAccelerator.is_available() or (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()): return "16-mixed" if training else "16-true" - return "bf16-mixed" if training else "bf16-true" + return "bf16-true" def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, strict: bool = True) -> None: diff --git a/tutorials/finetune_adapter.md b/tutorials/finetune_adapter.md index 9534d3de9c..1446de67c5 100644 --- a/tutorials/finetune_adapter.md +++ b/tutorials/finetune_adapter.md @@ -2,7 +2,7 @@ Adapter, first introduced for the LLaMA model as [LLaMA-Adapter](https://arxiv.org/abs/2303.16199), is a form of prefix-tuning that prepends a learnable adaption-prompt to the inputs of the attention blocks in an LLM. 
 In total, there are only ~500k parameters to update during finetuning in StableLM 3B, which significantly reduces the memory footprint and speeds up training.
 
-We are able to demonstrate instruction-finetuning LitGPT StableLM 3B on the [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset on a **single RTX 3060 GPU**. If using 8 GPUs, finetuning can be completed in under 1 hour.
+We are able to demonstrate instruction-finetuning LitGPT StableLM 3B on the Alpaca 2k dataset (a subset of [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)) on a **single RTX 3060 GPU**. If using 8 GPUs, finetuning can be completed in under 1 hour.
 
 If you are new to Adapter and are interested to learn more about how it works before proceeding with the finetuning guide below, you might find our article [Understanding Parameter-Efficient Finetuning of Large Language Models: From Prefix Tuning to LLaMA-Adapters](https://lightning.ai/pages/community/article/understanding-llama-adapters/) helpful.
 
@@ -19,38 +19,85 @@ LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and
 You can optionally [prepare your own dataset](#tune-on-your-dataset).
 For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial.
 
-## Running the finetuning
+For example,
 
 ```bash
-litgpt finetune adapter \
-  --data Alpaca \
-  --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b
+litgpt download --repo_id stablelm-base-alpha-3b
 ```
 
-or for Adapter V2
+LitGPT provides common datasets for finetuning, such as Alpaca 2k, Alpaca, LIMA, Dolly, and more.
+You can optionally [prepare your own dataset](#tune-on-your-dataset).
+For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial.
+
+&nbsp;
+
+## Running the finetuning
+
+To finetune the default `"stablelm-base-alpha-3b"` model on Alpaca2k, run the following command:
 
 ```bash
-litgpt finetune adapter_v2 \
-  --data Alpaca \
-  --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b
+litgpt finetune adapter --data Alpaca2k
 ```
 
-The finetuning requires at least one GPU with ~12 GB memory.
-You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available.
-Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently.
-To fit Adapter V2 to 12GB memory set `--train.micro_batch_size 2`.
-
-For example, the following settings will let you finetune the model in under 1 hour:
+Alternatively, you can use Adapter V2 as follows:
 
 ```bash
---devices 4 --train.micro_batch_size 4
+litgpt finetune adapter_v2
+```
+
+The preceding code will initiate the training, which will print the following outputs (via an A10G GPU):
+
+```
+{'checkpoint_dir': PosixPath('checkpoints/stabilityai/stablelm-base-alpha-3b'),
+ 'data': Alpaca2k,
+ 'devices': 1,
+ 'eval': EvalArgs(interval=50, max_new_tokens=100, max_iters=100),
+ 'logger_name': 'csv',
+ 'out_dir': PosixPath('out/finetune/adapter-v2'),
+ 'precision': None,
+ 'quantize': None,
+ 'seed': 1337,
+ 'train': TrainArgs(save_interval=1000,
+                    log_interval=1,
+                    global_batch_size=8,
+                    micro_batch_size=1,
+                    lr_warmup_steps=10,
+                    epochs=1,
+                    max_tokens=None,
+                    max_steps=None,
+                    max_seq_length=None,
+                    tie_embeddings=None,
+                    learning_rate=0.002,
+                    weight_decay=0.02,
+                    beta1=0.9,
+                    beta2=0.95,
+                    max_norm=None,
+                    min_lr=6e-05)}
+Seed set to 1337
+Number of trainable parameters: 2,125,248
+Number of non-trainable parameters: 3,637,051,392
+The longest sequence length in the train data is 634, the model's maximum sequence length is 634 and context length is 4096
+...
+Epoch 1 | iter 1 step 0 | loss train: 1.919, val: n/a | iter time: 304.25 ms
+Epoch 1 | iter 2 step 0 | loss train: 2.004, val: n/a | iter time: 88.54 ms
+...
+Epoch 1 | iter 1899 step 237 | loss train: 1.238, val: 1.420 | iter time: 85.90 ms
+Epoch 1 | iter 1900 step 237 | loss train: 1.313, val: 1.420 | iter time: 48.38 ms
+Epoch 2 | iter 1901 step 237 | loss train: 1.422, val: 1.420 | iter time: 279.63 ms
+Training time: 281.17s
+Memory used: 9.44 GB
+Saving adapter v2 weights to 'out/finetune/adapter-v2/final/lit_model.pth.adapter_v2'
 ```
+
+The finetuning requires at least one GPU with ~10 GB memory.
+You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available.
+Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently.
+
 This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name:
 
 ```bash
 litgpt finetune adapter \
-  --data Alpaca \
   --out_dir out/adapter/my-model-finetuned
 ```
 
@@ -58,7 +105,6 @@ or for Adapter V2
 
 ```bash
 litgpt finetune adapter_v2 \
-  --data Alpaca \
   --out_dir out/adapter_v2/my-model-finetuned
 ```
 
@@ -67,13 +113,15 @@ For instance, to fine-tune on MPS (the GPU on modern Macs), you can run
 
 ```bash
 litgpt finetune adapter \
-  --data Alpaca \
+  --data Alpaca2k \
   --out_dir out/adapter/my-model-finetuned \
   --precision 32-true
 ```
 
 Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac.
 
+&nbsp;
+
 ### Quantization
 
 Optionally, finetuning using quantization can be enabled via the `--quantize` flag, for example using the 4-bit NormalFloat data type:
 
 ```bash
 litgpt finetune adapter_v2 --quantize "bnb.nf4"
 ```
 
@@ -90,6 +138,8 @@ litgpt finetune adapter_v2 --quantize "bnb.nf4-dq"
 ```
 
 For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md).
 
+&nbsp;
+
 ## Test the model
 
 You can test the finetuned model with your own instructions by running:
 
@@ -116,6 +166,8 @@ A good movie to watch on the weekend would be The Lion King, since it's a classi
 
 If your GPU supports `bfloat16`, the script will automatically use it.
 
+&nbsp;
+
 ## Tune on your dataset
 
 You can easily train on your own instruction dataset saved in JSON format.
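For reference, below is a minimal sketch of what such a JSON instruction dataset can look like. The `instruction`/`input`/`output` field names follow the Alpaca-style convention; treat the exact schema and the `data/mydata.json` path as assumptions and consult [prepare_dataset.md](./prepare_dataset.md) for the authoritative format.

```bash
# Hypothetical example file: the instruction/input/output keys assume the
# Alpaca-style schema and are not confirmed by this patch.
mkdir -p data
cat > data/mydata.json <<'EOF'
[
  {
    "instruction": "Recommend a movie to watch on the weekend.",
    "input": "",
    "output": "A good movie to watch on the weekend would be The Lion King."
  }
]
EOF
```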
diff --git a/tutorials/finetune_full.md b/tutorials/finetune_full.md
index 0a08256914..12f763a9b6 100644
--- a/tutorials/finetune_full.md
+++ b/tutorials/finetune_full.md
@@ -1,7 +1,8 @@
 # Finetuning the whole model
 
-If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It may only be recommended as a baseline for comparison studies.
+If you are interested in parameter-efficient finetuning, check out [finetune_lora.md](finetune_lora.md) and [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It may only be recommended as a baseline for comparison studies.
 
+&nbsp;
 ## Preparation
 
 The steps here only need to be done once:
@@ -9,19 +10,55 @@ The steps here only need to be done once:
 1. Follow the instructions in the [README](../README.md) to install the dependencies.
 2. Download and convert the weights following our [guide](download_model_weights.md).
 
-LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more.
+LitGPT provides common datasets for finetuning, such as Alpaca2k, Alpaca, LIMA, Dolly, and more.
 You can optionally [prepare your own dataset](#tune-on-your-dataset).
 For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial.
 
+For example,
+
+```bash
+litgpt download --repo_id stabilityai/stablelm-base-alpha-3b
+```
+
+&nbsp;
 ## Running the finetuning
 
+The following will run the finetuning on the default model (`stabilityai/stablelm-base-alpha-3b`) on the Alpaca2k dataset:
+
 ```bash
-litgpt finetune full \
-  --data Alpaca \
-  --checkpoint_dir checkpoints/tiiuae/falcon-7b
+litgpt finetune full --data Alpaca2k
+```
+
+The preceding code will initiate the training, which will print the following outputs (via an A100 GPU):
+
+```
+{'checkpoint_dir': PosixPath('checkpoints/stabilityai/stablelm-base-alpha-3b'),
+'data': Alpaca2k,
+'devices': 1,
+'eval': EvalArgs(interval=50, max_new_tokens=100, max_iters=100),
+'logger_name': 'csv',
+'out_dir': PosixPath('out/finetune/full'),
+'precision': 'bf16-true',
+'resume': False,
+'seed': 1337,
+'train': TrainArgs(save_interval=1000, log_interval=1, global_batch_size=16, micro_batch_size=1, lr_warmup_steps=1000, epochs=1, max_tokens=None, max_steps=None, max_seq_length=None, tie_embeddings=None, learning_rate=0.0002, weight_decay=0.02, beta1=0.9, beta2=0.95, max_norm=None, min_lr=6e-05)}
+/home/sebastian/miniforge3/envs/litgpt/lib/python3.9/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3.9 /home/sebastian/miniforge3/envs/litgpt/bin/litgpt ...
+Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.76M/1.76M [00:00<00:00, 3.40MB/s]
+Generating train split: 2000 examples [00:00, 108714.24 examples/s]
+Seed set to 1337
+Number of trainable parameters: 3,637,321,728
+The longest sequence length in the train data is 634, the model's maximum sequence length is 634 and context length is 4096
+...
+Epoch 1 | iter 1 step 0 | loss train: 1.934, val: n/a | iter time: 190.05 ms
+Epoch 1 | iter 2 step 0 | loss train: 2.014, val: n/a | iter time: 57.58 ms
+...
+Epoch 1 | iter 1900 step 118 | loss train: 1.215, val: 1.392 | iter time: 59.49 ms
+Epoch 2 | iter 1901 step 118 | loss train: 1.271, val: 1.392 | iter time: 237.44 ms
+Training time: 140.63s
+Memory used: 36.51 GB
 ```
 
-Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each.
+Finetuning the 3B StableLM model requires one GPU with at least 36.51 GB of memory. If you only have a smaller GPU available, you can consider parameter-efficient finetuning (for example, [LoRA](finetune_lora.md) or [Adapter](finetune_adapter.md)) or [use a smaller LLM like 1.1B TinyLlama](download_model_weights.md).
 You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available.
 Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently.
 
@@ -30,30 +67,28 @@ This script will save checkpoints periodically to the `out_dir` directory. If yo
 
 ```bash
 litgpt finetune full \
-  --data Alpaca \
+  --data Alpaca2k \
   --out_dir out/full/my-model-finetuned
 ```
 
-If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument.
-For instance, to fine-tune on MPS (the GPU on modern Macs), you can run
+**Tip**
 
-```bash
+You can find suggested configuration files for different types of models in the [config_hub/finetune](../config_hub/finetune) folder.
+You can use these config files as follows:
+
+```
 litgpt finetune full \
-  --data Alpaca \
-  --out_dir out/full/my-model-finetuned \
-  --precision 32-true
+  --config config_hub/finetune/tiny-llama/full.yaml
 ```
 
-Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac.
-
+&nbsp;
 
 ## Test the model
 
 You can test the finetuned model with your own instructions by running:
 
 ```bash
-litgpt generate full \
+litgpt generate base \
   --prompt "Recommend a movie to watch on the weekend." \
-  --checkpoint_dir checkpoints/tiiuae/falcon-7b \
   --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth
 ```
 
@@ -65,6 +100,7 @@ A good movie to watch on the weekend would be The Lion King, since it's a classi
 
 If your GPU supports `bfloat16`, the script will automatically use it.
 
+&nbsp;
 ## Tune on your dataset
 
 You can easily train on your own instruction dataset saved in JSON format.
@@ -90,6 +126,5 @@ You can easily train on your own instruction dataset saved in JSON format.
 litgpt finetune full \
   --data JSON \
   --data.json_path data/mydata.json \
-  --checkpoint_dir checkpoints/tiiuae/falcon-7b \
   --out_dir data/mydata-finetuned
 ```
diff --git a/tutorials/finetune_lora.md b/tutorials/finetune_lora.md
index 19c2201f3c..db1b42faba 100644
--- a/tutorials/finetune_lora.md
+++ b/tutorials/finetune_lora.md
@@ -1,7 +1,7 @@
 # Finetuning with LoRA / QLoRA
 
 [Low-rank adaption (LoRA)](https://arxiv.org/abs/2106.09685) is a technique to approximate the update to the linear layers in a LLM with a low-rank matrix factorization. This significantly reduces the number of trainable parameters and speeds up training with little impact on the final performance of the model.
-We demonstrate this method by instruction-finetuning LitGPT StableLM 3B on the [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset on a **single RTX 3090 (24GB) GPU** with CUDA 11.8.
+We demonstrate this method by instruction-finetuning LitGPT StableLM 3B on the Alpaca 2k dataset (a subset of [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)) on a **single RTX 3090 (24GB) GPU** with CUDA 11.8.
 
 &nbsp;
 
@@ -13,7 +13,13 @@ The steps here only need to be done once:
 2. Download and convert the weights and save them in the `./checkpoints` folder. Weights can be downloaded following the instructions in the [download_model_weights](download_model_weights.md) documentation:
 
-LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more.
+For example,
+
+```bash
+litgpt download --repo_id stablelm-base-alpha-3b
+```
+
+LitGPT provides common datasets for finetuning, such as Alpaca 2k, Alpaca, LIMA, Dolly, and more.
 You can optionally [prepare your own dataset](#tune-on-your-dataset).
 For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial.
 
@@ -21,11 +27,73 @@ For more information about dataset preparation, also see the [prepare_dataset.md
 
 ## Running the Finetuning
 
+To finetune the default `"stablelm-base-alpha-3b"` model on Alpaca2k, run the following command:
+
 ```bash
-litgpt finetune lora --data Alpaca
+litgpt finetune lora --data Alpaca2k
+```
+
+The preceding code will initiate the training, which will print the following outputs (via an A10G GPU):
+
+```
+{'checkpoint_dir': PosixPath('checkpoints/stabilityai/stablelm-base-alpha-3b'),
+ 'data': Alpaca2k,
+ 'devices': 1,
+ 'eval': EvalArgs(interval=100, max_new_tokens=100, max_iters=100),
+ 'logger_name': 'csv',
+ 'lora_alpha': 16,
+ 'lora_dropout': 0.05,
+ 'lora_head': False,
+ 'lora_key': False,
+ 'lora_mlp': False,
+ 'lora_projection': False,
+ 'lora_query': True,
+ 'lora_r': 32,
+ 'lora_value': True,
+ 'out_dir': PosixPath('out/finetune/lora'),
+ 'precision': None,
+ 'quantize': None,
+ 'seed': 1337,
+ 'train': TrainArgs(save_interval=1000,
+                    log_interval=1,
+                    global_batch_size=8,
+                    micro_batch_size=1,
+                    lr_warmup_steps=100,
+                    epochs=4,
+                    max_tokens=None,
+                    max_steps=None,
+                    max_seq_length=None,
+                    tie_embeddings=None,
+                    learning_rate=0.0002,
+                    weight_decay=0.02,
+                    beta1=0.9,
+                    beta2=0.95,
+                    max_norm=None,
+                    min_lr=6e-05)}
+Seed set to 1337
+Number of trainable parameters: 8,388,608
+Number of non-trainable parameters: 3,637,321,728
+The longest sequence length in the train data is 634, the model's maximum sequence length is 634 and context length is 4096
+...
+
+Many people prefer to simply skip the Internet, but it is important to provide a list of new recommendations
+Epoch 1 | iter 1 step 0 | loss train: 1.919, val: n/a | iter time: 270.43 ms
+Epoch 1 | iter 2 step 0 | loss train: 2.004, val: n/a | iter time: 95.23 ms
+Epoch 1 | iter 3 step 0 | loss train: 2.355, val: n/a | iter time: 70.50 ms
+...
+Epoch 4 | iter 7596 step 949 | loss train: 1.254, val: 1.365 | iter time: 62.84 ms
+Epoch 4 | iter 7597 step 949 | loss train: 1.283, val: 1.365 | iter time: 125.17 ms
+Epoch 4 | iter 7598 step 949 | loss train: 1.272, val: 1.365 | iter time: 88.44 ms
+Epoch 4 | iter 7599 step 949 | loss train: 1.212, val: 1.365 | iter time: 62.70 ms
+Epoch 4 | iter 7600 step 950 | loss train: 1.117, val: 1.365 | iter time: 61.85 ms (step)
+Epoch 5 | iter 7601 step 950 | loss train: 1.032, val: 1.365 | iter time: 198.48 ms
+Training time: 780.54s
+Memory used: 8.86 GB
+Saving LoRA weights to 'out/finetune/lora/final/lit_model.pth.lora'
+Saved merged weights to 'out/finetune/lora/final/lit_model.pth'
 ```
 
-The finetuning requires at least one GPU with ~24 GB memory (RTX 3090).
+The finetuning requires at least one GPU with ~9 GB memory.
 
 This script will save checkpoints periodically to the folder `out/`.
 
@@ -50,8 +118,8 @@ The table below lists a comparison with different settings on a StableLM 3B mode
 
 | Settings                                     | Training Memory | Training Time | Inference Memory  |
 |----------------------------------------------|-----------------|---------------|-------------------|
-| Default (bf16-mixed)                         | 26.92 GB        | 1.34 min      | 21.43 GB          |
-| --precision bf16-true                        | 9.69 GB         | 1.24 min      | 7.30 GB           |
+| --precision bf16-mixed                       | 26.92 GB        | 1.34 min      | 21.43 GB          |
+| --precision bf16-true (default)              | 9.69 GB         | 1.24 min      | 7.30 GB           |
 | --precision bf16-true --quantize bnb.nf4     | 6.35 GB         | 1.82 min      | 3.20 GB           |
 | --precision bf16-true --quantize bnb.nf4-dq  | 6.19 GB         | 1.87 min      | 3.04 GB          |
 
@@ -59,12 +127,24 @@ The advantages of QLoRA-style quantization are more pronounced in larger models,
 
 | Settings                                     | Training Memory  | Training Time | Inference Memory |
 |----------------------------------------------|------------------|---------------|------------------|
-| Default (bf16-mixed)                         | OutOfMemoryError | N/A           | 40.21 GB         |
-| --precision bf16-true                        | 21.30 GB         | 2.36 min      | 13.52 GB         |
+| --precision bf16-mixed                       | OutOfMemoryError | N/A           | 40.21 GB         |
+| --precision bf16-true (default)              | 21.30 GB         | 2.36 min      | 13.52 GB         |
 | --precision bf16-true --quantize bnb.nf4     | 14.14 GB         | 3.68 min      | 4.57 GB          |
 | --precision bf16-true --quantize bnb.nf4-dq  | 13.84 GB         | 3.83 min      | 4.26 GB          |
 
-For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md).
+For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md).
+
+&nbsp;
+
+**Tip**
+
+You can find suggested configuration files for different types of models in the [config_hub/finetune](../config_hub/finetune) folder.
+You can use these config files as follows:
+
+```
+litgpt finetune lora \
+  --config config_hub/finetune/stablelm-base-alpha-3b/lora.yaml
+```
 
 &nbsp;
 
@@ -84,7 +164,7 @@ Output:
 
 I would recommend the movie The Martian (2015). It is a sci-fi movie starring Matt Damon that follows the story of...
 ```
 
-If your GPU supports `bfloat16`, you can additionally pass `--precision "bf16-true"` to bring the memory consumption down to ~7.6 GB for StableLM-3B (versus ~15.2 GB for `--precision "32-full"`). In addition, you may use quantization methods, for example `--precision "bf16-true" --quantize "bnb.nf4"` brings the memory consumption further down to ~4.4 GB for StableLM-3B.
+You can additionally use quantization methods, for example `--quantize "bnb.nf4"`, which brings the memory consumption further down to ~4.4 GB for StableLM-3B.
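As a minimal sketch of how that could look in practice for the merged LoRA checkpoint produced above (the checkpoint path and flag combination are assumptions based on the commands shown earlier in these tutorials, not confirmed by this patch):

```bash
# Hypothetical invocation: generate from the merged LoRA weights with 4-bit
# NormalFloat quantization to lower inference memory.
litgpt generate base \
  --prompt "Recommend a movie to watch on the weekend." \
  --checkpoint_dir out/finetune/lora/final \
  --quantize "bnb.nf4"
```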