diff --git a/litgpt/__main__.py b/litgpt/__main__.py
index eb20d030e7..12a0d97519 100644
--- a/litgpt/__main__.py
+++ b/litgpt/__main__.py
@@ -13,7 +13,6 @@
 from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn
 from litgpt.generate.base import main as generate_base_fn
 from litgpt.generate.full import main as generate_full_fn
-from litgpt.generate.lora import main as generate_lora_fn
 from litgpt.generate.sequentially import main as generate_sequentially_fn
 from litgpt.generate.tp import main as generate_tp_fn
 from litgpt.pretrain import setup as pretrain_fn
@@ -55,7 +54,6 @@ def main() -> None:
             "help": "Generate text samples based on a model and tokenizer.",
             "base": {"fn": generate_base_fn, "help": "Default generation option."},
             "full": {"fn": generate_full_fn, "help": "For models finetuned with `litgpt finetune full`."},
-            "lora": {"fn": generate_lora_fn, "help": "For models finetuned with `litgpt finetune lora`."},
             "adapter": {"fn": generate_adapter_fn, "help": "For models finetuned with `litgpt finetune adapter`."},
             "adapter_v2": {
                 "fn": generate_adapter_v2_fn,
diff --git a/litgpt/generate/lora.py b/litgpt/generate/lora.py
deleted file mode 100644
index 2fedf4dd69..0000000000
--- a/litgpt/generate/lora.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-
-import sys
-import time
-from pathlib import Path
-from typing import Literal, Optional
-
-import lightning as L
-import torch
-from lightning.fabric.plugins import BitsandbytesPrecision
-
-from litgpt import Tokenizer, PromptStyle
-from litgpt.generate.base import generate
-from litgpt.lora import GPT, Config, merge_lora_weights
-from litgpt.prompts import load_prompt_style, has_prompt_style
-from litgpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load
-
-
-def main(
-    prompt: str = "What food do llamas eat?",
-    input: str = "",
-    lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"),
-    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
-    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None,
-    max_new_tokens: int = 100,
-    top_k: Optional[int] = 200,
-    temperature: float = 0.8,
-    precision: Optional[str] = None,
-    lora_r: int = 8,
-    lora_alpha: int = 16,
-    lora_dropout: float = 0.05,
-    lora_query: bool = True,
-    lora_key: bool = False,
-    lora_value: bool = True,
-    lora_projection: bool = False,
-    lora_mlp: bool = False,
-    lora_head: bool = False,
-) -> None:
-    """Generates a response based on a given instruction and an optional input. This script will only work with
-    checkpoints from the instruction-tuned LoRA model. See ``litgpt.finetune.lora``.
-
-    Args:
-        prompt: The prompt/instruction (Alpaca style).
-        input: Optional input (Alpaca style).
-        lora_path: Path to the checkpoint with trained adapter weights, which are the output of
-            ``litgpt.finetune.lora``.
-        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
-        quantize: Whether to quantize the model and using which method:
-            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
-            - bnb.int8: 8-bit quantization from bitsandbytes
-            for more details, see https://github.com/Lightning-AI/litgpt/blob/main/tutorials/quantize.md
-        max_new_tokens: The number of generation steps to take.
-        top_k: The number of top most probable tokens to consider in the sampling process.
-        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
-            samples.
-        precision: Indicates the Fabric precision setting to use.
-    """
-    precision = precision or get_default_supported_precision(training=False)
-
-    plugins = None
-    if quantize is not None and quantize.startswith("bnb."):
-        if "mixed" in precision:
-            raise ValueError("Quantization and mixed precision is not supported.")
-        dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
-        plugins = BitsandbytesPrecision(quantize[4:], dtype)
-        precision = None
-
-    fabric = L.Fabric(devices=1, precision=precision, plugins=plugins)
-    fabric.launch()
-
-    check_valid_checkpoint_dir(checkpoint_dir)
-
-    config = Config.from_file(
-        checkpoint_dir / "model_config.yaml",
-        lora_r=lora_r,
-        lora_alpha=lora_alpha,
-        lora_dropout=lora_dropout,
-        lora_query=lora_query,
-        lora_key=lora_key,
-        lora_value=lora_value,
-        lora_projection=lora_projection,
-        lora_mlp=lora_mlp,
-        lora_head=lora_head,
-    )
-
-    checkpoint_path = checkpoint_dir / "lit_model.pth"
-
-    tokenizer = Tokenizer(checkpoint_dir)
-    prompt_style = load_prompt_style(checkpoint_dir) if has_prompt_style(checkpoint_dir) else PromptStyle.from_config(config)
-
-    prompt = prompt_style.apply(prompt, input=input)
-    encoded = tokenizer.encode(prompt, device=fabric.device)
-    prompt_length = encoded.size(0)
-    max_returned_tokens = prompt_length + max_new_tokens
-
-    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
-    t0 = time.perf_counter()
-    with fabric.init_module(empty_init=True):
-        model = GPT(config)
-    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
-    with fabric.init_tensor():
-        # set the max_seq_length to limit the memory usage to what we need
-        model.max_seq_length = max_returned_tokens
-        # enable the kv cache
-        model.set_kv_cache(batch_size=1)
-    model.eval()
-
-    t0 = time.perf_counter()
-    checkpoint = lazy_load(checkpoint_path)
-    lora_checkpoint = lazy_load(lora_path)
-    checkpoint.update(lora_checkpoint.get("model", lora_checkpoint))
-    model.load_state_dict(checkpoint)
-    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
-
-    merge_lora_weights(model)
-    model = fabric.setup(model)
-
-    L.seed_everything(1234)
-    t0 = time.perf_counter()
-    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
-    t = time.perf_counter() - t0
-
-    output = tokenizer.decode(y)
-    output = output.split("### Response:")[1].strip()
-    fabric.print(output)
-
-    tokens_generated = y.size(0) - prompt_length
-    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
-    if fabric.device.type == "cuda":
-        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    torch.set_float32_matmul_precision("high")
-
-    CLI(main)
diff --git a/tests/test_generate_lora.py b/tests/test_generate_lora.py
deleted file mode 100644
index 02bccdebfb..0000000000
--- a/tests/test_generate_lora.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-
-import os
-import subprocess
-import sys
-from contextlib import redirect_stderr, redirect_stdout
-from io import StringIO
-from pathlib import Path
-from unittest import mock
-from unittest.mock import ANY, Mock, call
-
-import pytest
-import torch
-import yaml
-
-
-@mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"})
-def test_main(fake_checkpoint_dir, monkeypatch, tensor_like):
-    import litgpt.generate.lora as generate
-
-    config_path = fake_checkpoint_dir / "model_config.yaml"
-    config = {
-        "block_size": 128,
-        "vocab_size": 50,
-        "n_layer": 2,
-        "n_head": 4,
-        "n_embd": 8,
-        "rotary_percentage": 1,
-        "lora_query": False,
-        "lora_value": False,
-        "lora_projection": True,
-    }
-    config_path.write_text(yaml.dump(config))
-
-    monkeypatch.setattr(generate, "lazy_load", Mock())
-    monkeypatch.setattr(generate.GPT, "load_state_dict", Mock())
-    tokenizer_mock = Mock()
-    tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]])
-    tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz"
-    monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock)
-    generate_mock = Mock()
-    generate_mock.return_value = torch.tensor([[3, 2, 1]])
-    monkeypatch.setattr(generate, "generate", generate_mock)
-
-    num_samples = 1
-    out, err = StringIO(), StringIO()
-    with redirect_stdout(out), redirect_stderr(err):
-        generate.main(temperature=2.0, top_k=2, checkpoint_dir=fake_checkpoint_dir)
-
-    assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples
-    assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value)
-    assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, eos_id=ANY)] * num_samples
-    # only the generated result is printed to stdout
-    assert out.getvalue() == "foo bar baz\n" * num_samples
-
-    assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue()
-
-
-@pytest.mark.parametrize("mode", ["file", "entrypoint"])
-def test_cli(mode):
-    if mode == "file":
-        cli_path = Path(__file__).parent.parent / "litgpt/generate/lora.py"
-        args = [sys.executable, cli_path, "-h"]
-    else:
-        args = ["litgpt", "generate", "lora", "-h"]
-    output = subprocess.check_output(args)
-    output = str(output.decode())
-    assert "Generates a response" in output
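For reference, the merge-then-generate flow that the deleted litgpt/generate/lora.py implemented can still be reproduced with the APIs the script itself imported (litgpt.lora.GPT, Config, merge_lora_weights, litgpt.generate.base.generate). The following is a condensed sketch of that flow, not part of the diff above; the checkpoint directory, LoRA checkpoint path, prompt, and LoRA hyperparameters are placeholders and would need to match the values used during finetuning.

# Condensed sketch of the deleted script's flow; paths and LoRA hyperparameters
# are placeholders that must mirror the `litgpt finetune lora` run.
from pathlib import Path

import lightning as L

from litgpt import PromptStyle, Tokenizer
from litgpt.generate.base import generate
from litgpt.lora import GPT, Config, merge_lora_weights
from litgpt.utils import lazy_load

checkpoint_dir = Path("checkpoints/stabilityai/stablelm-base-alpha-3b")  # placeholder
lora_path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth")  # placeholder

fabric = L.Fabric(devices=1)
fabric.launch()

# LoRA hyperparameters must match finetuning so the adapter weights line up.
config = Config.from_file(
    checkpoint_dir / "model_config.yaml", lora_r=8, lora_alpha=16, lora_query=True, lora_value=True
)

tokenizer = Tokenizer(checkpoint_dir)
prompt = PromptStyle.from_config(config).apply("What food do llamas eat?", input="")
encoded = tokenizer.encode(prompt, device=fabric.device)
max_returned_tokens = encoded.size(0) + 100  # prompt length + new tokens

with fabric.init_module(empty_init=True):
    model = GPT(config)
with fabric.init_tensor():
    model.max_seq_length = max_returned_tokens
    model.set_kv_cache(batch_size=1)  # generation relies on a KV cache
model.eval()

# Overlay the LoRA weights on the base checkpoint, then fold them into the base layers.
checkpoint = lazy_load(checkpoint_dir / "lit_model.pth")
lora_checkpoint = lazy_load(lora_path)
checkpoint.update(lora_checkpoint.get("model", lora_checkpoint))
model.load_state_dict(checkpoint)
merge_lora_weights(model)
model = fabric.setup(model)

y = generate(model, encoded, max_returned_tokens, temperature=0.8, top_k=200, eos_id=tokenizer.eos_id)
fabric.print(tokenizer.decode(y))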