diff --git a/finetune/adapter.py b/finetune/adapter.py index ea16344fca..cab6d78408 100644 --- a/finetune/adapter.py +++ b/finetune/adapter.py @@ -4,11 +4,12 @@ import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple import lightning as L import torch from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision from lightning.fabric.strategies import FSDPStrategy from lightning.fabric.utilities import ThroughputMonitor @@ -23,7 +24,7 @@ check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, - lazy_load, + load_checkpoint, num_parameters, ) from scripts.prepare_alpaca import generate_prompt @@ -56,11 +57,24 @@ def setup( checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), out_dir: Path = Path("out/adapter/alpaca"), precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, ) -> None: precision = precision or get_default_supported_precision(training=True) - fabric_devices = devices - if fabric_devices > 1: + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." + ) strategy = FSDPStrategy( auto_wrap_policy={Block}, activation_checkpointing_policy={Block}, @@ -72,7 +86,7 @@ def setup( strategy = "auto" logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) - fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) fabric.print(hparams) fabric.launch(main, data_dir, checkpoint_dir, out_dir) @@ -91,20 +105,26 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) config = Config.from_name(name=checkpoint_dir.name) checkpoint_path = checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") - with fabric.init_module(empty_init=False): + with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) - checkpoint = lazy_load(checkpoint_path) - # strict=False because missing keys due to adapter weights not contained in state dict - model.load_state_dict(checkpoint, strict=False) - mark_only_adapter_as_trainable(model) fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer = bnb.optim.PagedAdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + else: + optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + optimizer = fabric.setup_optimizers(optimizer) - optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) - model, optimizer = fabric.setup(model, optimizer) + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) fabric.seed_everything(1337 + fabric.global_rank) diff --git a/finetune/adapter_v2.py b/finetune/adapter_v2.py index 3095210ca9..89d16f790c 100644 --- a/finetune/adapter_v2.py +++ b/finetune/adapter_v2.py @@ -4,11 +4,12 @@ import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple import lightning as L import torch from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision from lightning.fabric.strategies import FSDPStrategy from lightning.fabric.utilities import ThroughputMonitor @@ -23,7 +24,7 @@ check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, - lazy_load, + load_checkpoint, num_parameters, ) from scripts.prepare_alpaca import generate_prompt @@ -56,11 +57,24 @@ def setup( checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), out_dir: Path = Path("out/adapter_v2/alpaca"), precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, ) -> None: precision = precision or get_default_supported_precision(training=True) - fabric_devices = devices - if fabric_devices > 1: + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." + ) strategy = FSDPStrategy( auto_wrap_policy={Block}, activation_checkpointing_policy={Block}, @@ -72,7 +86,7 @@ def setup( strategy = "auto" logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) - fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) fabric.print(hparams) fabric.launch(main, data_dir, checkpoint_dir, out_dir) @@ -91,20 +105,26 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) config = Config.from_name(name=checkpoint_dir.name) checkpoint_path = checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") - with fabric.init_module(empty_init=False): + with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) - checkpoint = lazy_load(checkpoint_path) - # strict=False because missing keys due to adapter weights not contained in state dict - model.load_state_dict(checkpoint, strict=False) - mark_only_adapter_v2_as_trainable(model) fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer = bnb.optim.PagedAdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + else: + optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + optimizer = fabric.setup_optimizers(optimizer) - optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) - model, optimizer = fabric.setup(model, optimizer) + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) fabric.seed_everything(1337 + fabric.global_rank) diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 64da985741..c182c5d0de 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -1,13 +1,14 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. - from contextlib import redirect_stdout from dataclasses import asdict from io import StringIO from unittest.mock import Mock +import pytest import torch from conftest import RunIf from lightning import Fabric +from lightning.fabric.wrappers import _FabricOptimizer def test_config_identical(): @@ -67,8 +68,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch): model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0) monkeypatch.setitem(name_to_config, "tmp", model_config) - monkeypatch.setattr(module, "lazy_load", Mock()) - monkeypatch.setattr(module.GPT, "load_state_dict", Mock()) + monkeypatch.setattr(module, "load_checkpoint", Mock()) tokenizer_mock = Mock() tokenizer_mock.return_value = tokenizer_mock @@ -129,3 +129,110 @@ def test_adapter_compile(): assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 + + +@RunIf(min_cuda_gpus=1) +# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4 +@pytest.mark.xfail(raises=AttributeError, strict=False) +def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.adapter as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0, bias=True + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "train", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + data_dir=tmp_path, + checkpoint_dir=fake_checkpoint_dir, + out_dir=tmp_path, + precision="16-true", + quantize="bnb.nf4-dq", + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.wte.weight", + "transformer.h.0.norm_1.weight", + "transformer.h.0.norm_1.bias", + "transformer.h.0.attn.gating_factor", + "transformer.h.0.attn.attn.bias", + "transformer.h.0.attn.proj.bias", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.0.norm_2.weight", + "transformer.h.0.norm_2.bias", + "transformer.h.0.mlp.fc.bias", + "transformer.h.0.mlp.proj.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.norm_1.bias", + "transformer.h.1.attn.gating_factor", + "transformer.h.1.attn.attn.bias", + "transformer.h.1.attn.proj.bias", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.norm_2.weight", + "transformer.h.1.norm_2.bias", + "transformer.h.1.mlp.fc.bias", + "transformer.h.1.mlp.proj.bias", + "transformer.ln_f.weight", + "transformer.ln_f.bias", + }, + "torch.uint8": { + "lm_head.weight", + "transformer.h.0.attn.attn.weight", + "transformer.h.0.attn.proj.weight", + "transformer.h.0.mlp.fc.weight", + "transformer.h.0.mlp.proj.weight", + "transformer.h.1.attn.attn.weight", + "transformer.h.1.attn.proj.weight", + "transformer.h.1.mlp.fc.weight", + "transformer.h.1.mlp.proj.weight", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_adapter_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_adapter_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.0.attn.gating_factor", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.attn.gating_factor", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 168" in logs + assert "of non trainable parameters: 1,888" in logs diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py index 214040ba9f..6c961927ca 100644 --- a/tests/test_adapter_v2.py +++ b/tests/test_adapter_v2.py @@ -10,6 +10,7 @@ import torch from conftest import RunIf from lightning import Fabric +from lightning.fabric.wrappers import _FabricOptimizer # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -91,8 +92,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch): model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0) monkeypatch.setitem(name_to_config, "tmp", model_config) - monkeypatch.setattr(module, "lazy_load", Mock()) - monkeypatch.setattr(module.GPT, "load_state_dict", Mock()) + monkeypatch.setattr(module, "load_checkpoint", Mock()) tokenizer_mock = Mock() tokenizer_mock.return_value = tokenizer_mock @@ -219,3 +219,156 @@ def test_against_hf_mixtral(): ours_y = ours_model(x) theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) + + +@RunIf(min_cuda_gpus=1) +# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4 +@pytest.mark.xfail(raises=AttributeError, strict=False) +def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.adapter_v2 as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0, bias=True + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "train", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + data_dir=tmp_path, + checkpoint_dir=fake_checkpoint_dir, + out_dir=tmp_path, + precision="16-true", + quantize="bnb.nf4-dq", + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.uint8": { + "transformer.h.0.mlp.fc.linear.weight", + "transformer.h.1.mlp.proj.linear.weight", + "transformer.h.1.attn.attn.linear.weight", + "transformer.h.0.attn.proj.linear.weight", + "lm_head.linear.weight", + "transformer.h.1.attn.proj.linear.weight", + "transformer.h.0.mlp.proj.linear.weight", + "transformer.h.0.attn.attn.linear.weight", + "transformer.h.1.mlp.fc.linear.weight", + }, + "torch.float16": { + "transformer.h.1.attn.attn.adapter_bias", + "transformer.h.1.mlp.proj.adapter_bias", + "transformer.h.0.attn.attn.adapter_bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.attn.attn.linear.bias", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.ln_f.weight", + "transformer.h.0.mlp.fc.linear.bias", + "transformer.h.0.mlp.proj.linear.bias", + "transformer.h.1.mlp.fc.linear.bias", + "transformer.h.0.attn.proj.adapter_scale", + "transformer.h.0.attn.attn.adapter_scale", + "transformer.h.1.norm_2.bias", + "transformer.h.1.attn.proj.adapter_scale", + "transformer.h.0.norm_2.bias", + "transformer.h.0.mlp.fc.adapter_scale", + "transformer.h.0.attn.proj.linear.bias", + "transformer.h.1.attn.proj.linear.bias", + "transformer.h.1.norm_1.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.1.attn.proj.adapter_bias", + "transformer.h.0.mlp.proj.adapter_scale", + "transformer.h.0.mlp.proj.adapter_bias", + "transformer.h.1.mlp.fc.adapter_bias", + "transformer.h.1.mlp.proj.adapter_scale", + "transformer.h.1.attn.gating_factor", + "transformer.h.1.norm_1.weight", + "transformer.ln_f.bias", + "transformer.h.0.mlp.fc.adapter_bias", + "lm_head.adapter_scale", + "lm_head.adapter_bias", + "transformer.h.1.norm_2.weight", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.1.attn.attn.adapter_scale", + "transformer.h.1.mlp.fc.adapter_scale", + "transformer.h.1.attn.attn.linear.bias", + "transformer.wte.weight", + "transformer.h.0.norm_2.weight", + "transformer.h.1.mlp.proj.linear.bias", + "transformer.h.0.attn.gating_factor", + "transformer.h.0.attn.proj.adapter_bias", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_adapter_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_adapter_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.attn.proj.adapter_bias", + "transformer.h.1.mlp.fc.adapter_scale", + "lm_head.adapter_bias", + "transformer.h.0.mlp.proj.adapter_scale", + "transformer.ln_f.bias", + "lm_head.adapter_scale", + "transformer.h.1.norm_2.weight", + "transformer.h.0.attn.attn.adapter_scale", + "transformer.h.0.mlp.proj.adapter_bias", + "transformer.h.0.attn.gating_factor", + "transformer.h.1.norm_1.bias", + "transformer.h.1.mlp.fc.adapter_bias", + "transformer.h.1.mlp.proj.adapter_scale", + "transformer.h.0.mlp.fc.adapter_scale", + "transformer.h.1.attn.attn.adapter_bias", + "transformer.h.0.norm_2.weight", + "transformer.h.1.norm_2.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.0.attn.proj.adapter_scale", + "transformer.h.1.mlp.proj.adapter_bias", + "transformer.h.0.attn.attn.adapter_bias", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.ln_f.weight", + "transformer.h.1.attn.gating_factor", + "transformer.h.0.mlp.fc.adapter_bias", + "transformer.h.1.attn.proj.adapter_scale", + "transformer.h.0.attn.proj.adapter_bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.norm_2.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.attn.attn.adapter_scale", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 552" in logs + assert "of non trainable parameters: 1,808" in logs diff --git a/tests/test_lora.py b/tests/test_lora.py index 1d9364169b..da9967a52b 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -11,6 +11,7 @@ import torch from conftest import RunIf from lightning import Fabric +from lightning.fabric.wrappers import _FabricOptimizer # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -362,7 +363,7 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merge @RunIf(min_cuda_gpus=1) # platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4 @pytest.mark.xfail(raises=AttributeError, strict=False) -def test_lora_merge_with_quantize(): +def test_lora_merge_with_bitsandbytes(): from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision if not _BITSANDBYTES_AVAILABLE: @@ -548,3 +549,121 @@ def test_against_hf_mixtral(): ours_y = ours_model(x) theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) + + +@RunIf(min_cuda_gpus=1) +# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4 +@pytest.mark.xfail(raises=AttributeError, strict=False) +def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.lora as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, + n_layer=2, + n_embd=8, + n_head=4, + padded_vocab_size=8, + bias=True, + r=8, + alpha=8, + dropout=0.1, + to_query=True, + to_value=True, + to_projection=True, + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "train", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + data_dir=tmp_path, + checkpoint_dir=fake_checkpoint_dir, + out_dir=tmp_path, + precision="16-true", + quantize="bnb.nf4-dq", + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.uint8": { + "transformer.h.0.attn.attn.linear.weight", + "transformer.h.0.attn.proj.linear.weight", + "transformer.h.0.mlp.fc.linear.weight", + "transformer.h.1.mlp.proj.linear.weight", + "transformer.h.0.mlp.proj.linear.weight", + "transformer.h.1.attn.attn.linear.weight", + "lm_head.linear.weight", + "transformer.h.1.attn.proj.linear.weight", + "transformer.h.1.mlp.fc.linear.weight", + }, + "torch.float16": { + "transformer.h.0.attn.attn.lora_B", + "transformer.h.0.norm_2.weight", + "transformer.wte.weight", + "transformer.h.1.mlp.fc.linear.bias", + "transformer.ln_f.bias", + "transformer.h.1.attn.attn.lora_B", + "transformer.h.1.attn.proj.linear.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.attn.attn.linear.bias", + "transformer.h.1.attn.attn.lora_A", + "transformer.h.1.norm_1.bias", + "transformer.h.1.norm_2.bias", + "transformer.h.0.attn.proj.linear.bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.mlp.proj.linear.bias", + "transformer.h.0.mlp.fc.linear.bias", + "transformer.h.0.norm_2.bias", + "transformer.ln_f.weight", + "transformer.h.0.attn.attn.lora_A", + "transformer.h.1.norm_2.weight", + "transformer.h.1.mlp.proj.linear.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.0.attn.attn.linear.bias", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_lora_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_lora_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.1.attn.attn.lora_A", + "transformer.h.0.attn.attn.lora_A", + "transformer.h.0.attn.attn.lora_B", + "transformer.h.1.attn.attn.lora_B", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 512" in logs + assert "of non trainable parameters: 1,888" in logs diff --git a/tutorials/finetune_adapter.md b/tutorials/finetune_adapter.md index 6b3e120cf4..916e25af8b 100644 --- a/tutorials/finetune_adapter.md +++ b/tutorials/finetune_adapter.md @@ -69,6 +69,22 @@ python finetune/adapter.py --out_dir out/adapter/my-model-finetuned --precision Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. +### Quantization + +Optionally, finetuning using quantization can be enabled via the `--quantize` flag, for example using the 4-bit NormalFloat data type: + +```bash +python finetune/adapter.py --quantize "bnb.nf4" +``` + +or using adapter_v2 with double-quantization: + +```bash +python finetune/adapter_v2.py --quantize "bnb.nf4-dq" +``` + +For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md). + ## Test the model You can test the finetuned model with your own instructions by running: diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index c9d55ce6b8..e9cde2a109 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -6,10 +6,7 @@ - OS: Ubuntu 22.04.3 LTS (x86_64) - Nvidia driver version: 525.125.06 - Relevant libraries - - CMake 3.26.4 - - Libc glibc-2.35 - PyTorch 2.1.0+cu121 - - Lightning 2.1.0.rc0 - Bitsandbytes 0.41.1 This document provides an overview and examples of hardware requirements when running models in Lit-GPT. @@ -39,35 +36,63 @@ Note that the number of tokens in the training set does not affect the supported The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | -| | | | | | | | | -| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | 62.23 min | -| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | 91.22 min | -| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | 93.58 min | -| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | 66.68 min | -| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | 74.89 min | -| | | | | | | | | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | -| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | -| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | -| | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | -| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | -| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | - | -| | | | | | | | | -| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - | +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | +| | | | | | | | +| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | +| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | +| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | +| | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | +| | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | + +  + +## Finetuning with Adapter on 1 GPU + +The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/adapter.py` script. + +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 3 B | StableLM Alpha | None | 1 | 573,888 | 9.10 GB | 0.74 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 573,888 | 5.65 GB | 1.38 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 573,888 | 5.48 GB | 1.46 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 1,229,760 | 19.98 GB | 1.50 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 1,229,760 | 12.68 GB | 2.93 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 1,229,760 | 12.38 GB | 3.00 min | + +The same config, but using the `finetune/adapter_v2.py` script. + +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 3 B | StableLM Alpha | None | 1 | 2,125,248 | 10.71 GB | 0.87 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,125,248 | 7.41 GB | 1.59 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,125,248 | 7.25 GB | 1.62 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,279,744 | 25.51 GB | 1.81 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,279,744 | 18.30 GB | 3.23 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,279,744 | 17.98 GB | 3.32 min |   @@ -75,28 +100,28 @@ The following experiments were conducted on 1xA100 with a minibatch size of 128 The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | -| | | | | | | | | | -| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | 145.96 min | -| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | 153.10 min | -| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | 192.99 min | -| | | | | | | | - | - | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | -| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | - | +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | +|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | +| | | | | | | | | +| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | +| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | +| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | +| | | | | | | | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | +| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - |