diff --git a/litgpt/__main__.py b/litgpt/__main__.py
index eb20d030e7..12a0d97519 100644
--- a/litgpt/__main__.py
+++ b/litgpt/__main__.py
@@ -13,7 +13,6 @@
 from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn
 from litgpt.generate.base import main as generate_base_fn
 from litgpt.generate.full import main as generate_full_fn
-from litgpt.generate.lora import main as generate_lora_fn
 from litgpt.generate.sequentially import main as generate_sequentially_fn
 from litgpt.generate.tp import main as generate_tp_fn
 from litgpt.pretrain import setup as pretrain_fn
@@ -55,7 +54,6 @@ def main() -> None:
             "help": "Generate text samples based on a model and tokenizer.",
             "base": {"fn": generate_base_fn, "help": "Default generation option."},
             "full": {"fn": generate_full_fn, "help": "For models finetuned with `litgpt finetune full`."},
-            "lora": {"fn": generate_lora_fn, "help": "For models finetuned with `litgpt finetune lora`."},
             "adapter": {"fn": generate_adapter_fn, "help": "For models finetuned with `litgpt finetune adapter`."},
             "adapter_v2": {
                 "fn": generate_adapter_v2_fn,
diff --git a/litgpt/generate/lora.py b/litgpt/generate/lora.py
deleted file mode 100644
index 2fedf4dd69..0000000000
--- a/litgpt/generate/lora.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-
-import sys
-import time
-from pathlib import Path
-from typing import Literal, Optional
-
-import lightning as L
-import torch
-from lightning.fabric.plugins import BitsandbytesPrecision
-
-from litgpt import Tokenizer, PromptStyle
-from litgpt.generate.base import generate
-from litgpt.lora import GPT, Config, merge_lora_weights
-from litgpt.prompts import load_prompt_style, has_prompt_style
-from litgpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load
-
-
-def main(
-    prompt: str = "What food do llamas eat?",
-    input: str = "",
-    lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"),
-    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
-    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None,
-    max_new_tokens: int = 100,
-    top_k: Optional[int] = 200,
-    temperature: float = 0.8,
-    precision: Optional[str] = None,
-    lora_r: int = 8,
-    lora_alpha: int = 16,
-    lora_dropout: float = 0.05,
-    lora_query: bool = True,
-    lora_key: bool = False,
-    lora_value: bool = True,
-    lora_projection: bool = False,
-    lora_mlp: bool = False,
-    lora_head: bool = False,
-) -> None:
-    """Generates a response based on a given instruction and an optional input. This script will only work with
-    checkpoints from the instruction-tuned LoRA model. See ``litgpt.finetune.lora``.
-
-    Args:
-        prompt: The prompt/instruction (Alpaca style).
-        input: Optional input (Alpaca style).
-        lora_path: Path to the checkpoint with trained adapter weights, which are the output of
-            ``litgpt.finetune.lora``.
-        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
-        quantize: Whether to quantize the model and using which method:
-            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
-            - bnb.int8: 8-bit quantization from bitsandbytes
-            for more details, see https://github.com/Lightning-AI/litgpt/blob/main/tutorials/quantize.md
-        max_new_tokens: The number of generation steps to take.
-        top_k: The number of top most probable tokens to consider in the sampling process.
-        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
-            samples.
-        precision: Indicates the Fabric precision setting to use.
-    """
-    precision = precision or get_default_supported_precision(training=False)
-
-    plugins = None
-    if quantize is not None and quantize.startswith("bnb."):
-        if "mixed" in precision:
-            raise ValueError("Quantization and mixed precision is not supported.")
-        dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
-        plugins = BitsandbytesPrecision(quantize[4:], dtype)
-        precision = None
-
-    fabric = L.Fabric(devices=1, precision=precision, plugins=plugins)
-    fabric.launch()
-
-    check_valid_checkpoint_dir(checkpoint_dir)
-
-    config = Config.from_file(
-        checkpoint_dir / "model_config.yaml",
-        lora_r=lora_r,
-        lora_alpha=lora_alpha,
-        lora_dropout=lora_dropout,
-        lora_query=lora_query,
-        lora_key=lora_key,
-        lora_value=lora_value,
-        lora_projection=lora_projection,
-        lora_mlp=lora_mlp,
-        lora_head=lora_head,
-    )
-
-    checkpoint_path = checkpoint_dir / "lit_model.pth"
-
-    tokenizer = Tokenizer(checkpoint_dir)
-    prompt_style = load_prompt_style(checkpoint_dir) if has_prompt_style(checkpoint_dir) else PromptStyle.from_config(config)
-
-    prompt = prompt_style.apply(prompt, input=input)
-    encoded = tokenizer.encode(prompt, device=fabric.device)
-    prompt_length = encoded.size(0)
-    max_returned_tokens = prompt_length + max_new_tokens
-
-    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
-    t0 = time.perf_counter()
-    with fabric.init_module(empty_init=True):
-        model = GPT(config)
-    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
-    with fabric.init_tensor():
-        # set the max_seq_length to limit the memory usage to what we need
-        model.max_seq_length = max_returned_tokens
-        # enable the kv cache
-        model.set_kv_cache(batch_size=1)
-    model.eval()
-
-    t0 = time.perf_counter()
-    checkpoint = lazy_load(checkpoint_path)
-    lora_checkpoint = lazy_load(lora_path)
-    checkpoint.update(lora_checkpoint.get("model", lora_checkpoint))
-    model.load_state_dict(checkpoint)
-    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
-
-    merge_lora_weights(model)
-    model = fabric.setup(model)
-
-    L.seed_everything(1234)
-    t0 = time.perf_counter()
-    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
-    t = time.perf_counter() - t0
-
-    output = tokenizer.decode(y)
-    output = output.split("### Response:")[1].strip()
-    fabric.print(output)
-
-    tokens_generated = y.size(0) - prompt_length
-    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
-    if fabric.device.type == "cuda":
-        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    torch.set_float32_matmul_precision("high")
-
-    CLI(main)
diff --git a/tests/test_generate_lora.py b/tests/test_generate_lora.py
deleted file mode 100644
index 02bccdebfb..0000000000
--- a/tests/test_generate_lora.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-
-import os
-import subprocess
-import sys
-from contextlib import redirect_stderr, redirect_stdout
-from io import StringIO
-from pathlib import Path
-from unittest import mock
-from unittest.mock import ANY, Mock, call
-
-import pytest
-import torch
-import yaml
-
-
-@mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"})
-def test_main(fake_checkpoint_dir, monkeypatch, tensor_like):
-    import litgpt.generate.lora as generate
-
-    config_path = fake_checkpoint_dir / "model_config.yaml"
-    config = {
-        "block_size": 128,
-        "vocab_size": 50,
-        "n_layer": 2,
-        "n_head": 4,
-        "n_embd": 8,
-        "rotary_percentage": 1,
-        "lora_query": False,
-        "lora_value": False,
-        "lora_projection": True,
-    }
-    config_path.write_text(yaml.dump(config))
-
-    monkeypatch.setattr(generate, "lazy_load", Mock())
-    monkeypatch.setattr(generate.GPT, "load_state_dict", Mock())
-    tokenizer_mock = Mock()
-    tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]])
-    tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz"
-    monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock)
-    generate_mock = Mock()
-    generate_mock.return_value = torch.tensor([[3, 2, 1]])
-    monkeypatch.setattr(generate, "generate", generate_mock)
-
-    num_samples = 1
-    out, err = StringIO(), StringIO()
-    with redirect_stdout(out), redirect_stderr(err):
-        generate.main(temperature=2.0, top_k=2, checkpoint_dir=fake_checkpoint_dir)
-
-    assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples
-    assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value)
-    assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, eos_id=ANY)] * num_samples
-    # only the generated result is printed to stdout
-    assert out.getvalue() == "foo bar baz\n" * num_samples
-
-    assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue()
-
-
-@pytest.mark.parametrize("mode", ["file", "entrypoint"])
-def test_cli(mode):
-    if mode == "file":
-        cli_path = Path(__file__).parent.parent / "litgpt/generate/lora.py"
-        args = [sys.executable, cli_path, "-h"]
-    else:
-        args = ["litgpt", "generate", "lora", "-h"]
-    output = subprocess.check_output(args)
-    output = str(output.decode())
-    assert "Generates a response" in output
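For reference, the merge-then-generate flow that the deleted litgpt/generate/lora.py implemented can still be reproduced with the APIs the script itself imported (litgpt.lora.GPT, Config, merge_lora_weights, litgpt.generate.base.generate). The following is a condensed sketch of that flow, not part of the diff above; the checkpoint directory, LoRA checkpoint path, prompt, and LoRA hyperparameters are placeholders and would need to match the values used during finetuning.

# Condensed sketch of the deleted script's flow; paths and LoRA hyperparameters
# are placeholders that must mirror the `litgpt finetune lora` run.
from pathlib import Path

import lightning as L

from litgpt import PromptStyle, Tokenizer
from litgpt.generate.base import generate
from litgpt.lora import GPT, Config, merge_lora_weights
from litgpt.utils import lazy_load

checkpoint_dir = Path("checkpoints/stabilityai/stablelm-base-alpha-3b")  # placeholder
lora_path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth")  # placeholder

fabric = L.Fabric(devices=1)
fabric.launch()

# LoRA hyperparameters must match finetuning so the adapter weights line up.
config = Config.from_file(
    checkpoint_dir / "model_config.yaml", lora_r=8, lora_alpha=16, lora_query=True, lora_value=True
)

tokenizer = Tokenizer(checkpoint_dir)
prompt = PromptStyle.from_config(config).apply("What food do llamas eat?", input="")
encoded = tokenizer.encode(prompt, device=fabric.device)
max_returned_tokens = encoded.size(0) + 100  # prompt length + new tokens

with fabric.init_module(empty_init=True):
    model = GPT(config)
with fabric.init_tensor():
    model.max_seq_length = max_returned_tokens
    model.set_kv_cache(batch_size=1)  # generation relies on a KV cache
model.eval()

# Overlay the LoRA weights on the base checkpoint, then fold them into the base layers.
checkpoint = lazy_load(checkpoint_dir / "lit_model.pth")
lora_checkpoint = lazy_load(lora_path)
checkpoint.update(lora_checkpoint.get("model", lora_checkpoint))
model.load_state_dict(checkpoint)
merge_lora_weights(model)
model = fabric.setup(model)

y = generate(model, encoded, max_returned_tokens, temperature=0.8, top_k=200, eos_id=tokenizer.eos_id)
fabric.print(tokenizer.decode(y))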