From b9ddd8bdd8e759702ddb5b624333f422b4e76b5e Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Thu, 25 Apr 2024 08:39:02 -0500 Subject: [PATCH 01/21] Add precision arg for pretraining (#1353) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- config_hub/pretrain/debug.yaml | 3 +++ config_hub/pretrain/tinyllama.yaml | 3 +++ config_hub/pretrain/tinystories.yaml | 3 +++ litgpt/pretrain.py | 9 +++++++-- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/config_hub/pretrain/debug.yaml b/config_hub/pretrain/debug.yaml index 77ad6b13ad..bbe2fee2cc 100644 --- a/config_hub/pretrain/debug.yaml +++ b/config_hub/pretrain/debug.yaml @@ -11,6 +11,9 @@ model_config: # /teamspace/jobs//share. (type: , default: out/pretrain) out_dir: out/pretrain/debug +# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-mixed + # Optional path to a checkpoint directory to initialize the model from. # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) initial_checkpoint_dir: diff --git a/config_hub/pretrain/tinyllama.yaml b/config_hub/pretrain/tinyllama.yaml index fe43b8b216..a47bd946f3 100644 --- a/config_hub/pretrain/tinyllama.yaml +++ b/config_hub/pretrain/tinyllama.yaml @@ -11,6 +11,9 @@ model_config: # /teamspace/jobs//share. (type: , default: out/pretrain) out_dir: out/pretrain/tiny-llama +# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-mixed + # Optional path to a checkpoint directory to initialize the model from. # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) initial_checkpoint_dir: diff --git a/config_hub/pretrain/tinystories.yaml b/config_hub/pretrain/tinystories.yaml index b6d37209b1..8ef1232862 100644 --- a/config_hub/pretrain/tinystories.yaml +++ b/config_hub/pretrain/tinystories.yaml @@ -27,6 +27,9 @@ model_config: # /teamspace/jobs//share. (type: , default: out/pretrain) out_dir: out/pretrain/stories15M +# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) +precision: bf16-mixed + # Optional path to a checkpoint directory to initialize the model from. # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) initial_checkpoint_dir: diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py index 152e574cfd..3a763116a0 100644 --- a/litgpt/pretrain.py +++ b/litgpt/pretrain.py @@ -29,6 +29,7 @@ choose_logger, chunked_cross_entropy, copy_config_files, + get_default_supported_precision, init_out_dir, num_parameters, parse_devices, @@ -42,6 +43,7 @@ def setup( model_name: Optional[str] = None, model_config: Optional[Config] = None, out_dir: Path = Path("out/pretrain"), + precision: Literal["bf16-true", "bf16-mixed", "32-true", None] = None, initial_checkpoint_dir: Optional[Path] = None, resume: Union[bool, Path] = False, data: Optional[DataModule] = None, @@ -75,6 +77,7 @@ def setup( ``model_config``. out_dir: Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in /teamspace/jobs//share. + precision: The precision to use for finetuning. Determines a compatible precision setting by default. 
initial_checkpoint_dir: Optional path to a checkpoint directory to initialize the model from. Useful for continued pretraining. Mutually exclusive with ``resume``. resume: Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume @@ -96,6 +99,7 @@ def setup( available_models = "\n".join(sorted(name_to_config)) raise ValueError(f"Please specify --model_name . Available values:\n{available_models}") config = Config.from_name(model_name) if model_config is None else model_config + precision = precision or get_default_supported_precision(training=True) devices = parse_devices(devices) out_dir = init_out_dir(out_dir) # in case the dataset requires the Tokenizer @@ -109,7 +113,7 @@ def setup( strategy = FSDPStrategy(auto_wrap_policy={Block}, state_dict_type="full", sharding_strategy="HYBRID_SHARD") else: strategy = "auto" - fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-mixed", loggers=[logger]) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=[logger]) fabric.launch() fabric.print(pprint.pformat(hparams)) @@ -169,12 +173,13 @@ def main( model = torch.compile(model) model = fabric.setup(model) + optimizer = torch.optim.AdamW( model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2), - fused=True, + fused=fabric.device.type == "cuda", ) optimizer = fabric.setup_optimizers(optimizer) From 1d69eac021ae9f1df8ff427f5eec8e4dabaeb306 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Thu, 25 Apr 2024 13:57:25 -0500 Subject: [PATCH 02/21] Update litserve dependency (#1356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- litgpt/deploy/serve.py | 15 +++++++++++++-- pyproject.toml | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/litgpt/deploy/serve.py b/litgpt/deploy/serve.py index 4a26e0b14f..9cd594230d 100644 --- a/litgpt/deploy/serve.py +++ b/litgpt/deploy/serve.py @@ -1,11 +1,12 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
from pathlib import Path -from typing import Dict, Any, Optional, Literal +from typing import Dict, Any, Optional from litgpt.utils import check_valid_checkpoint_dir import lightning as L +from lightning_utilities.core.imports import RequirementCache import torch -from litserve import LitAPI, LitServer + from litgpt.model import GPT from litgpt.config import Config @@ -15,6 +16,13 @@ from litgpt.utils import load_checkpoint, CLI, get_default_supported_precision +_LITSERVE_AVAILABLE = RequirementCache("litserve") +if _LITSERVE_AVAILABLE: + from litserve import LitAPI, LitServer +else: + LitAPI, LitServer = object, object + + class SimpleLitAPI(LitAPI): def __init__(self, checkpoint_dir: Path, @@ -23,6 +31,9 @@ def __init__(self, top_k: int = 50, max_new_tokens: int = 50) -> None: + if not _LITSERVE_AVAILABLE: + raise ImportError(str(_LITSERVE_AVAILABLE)) + super().__init__() self.checkpoint_dir = checkpoint_dir self.precision = precision diff --git a/pyproject.toml b/pyproject.toml index d8d60ff594..ba3bc7c9e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,7 @@ license = { file = "LICENSE" } dependencies = [ "torch>=2.2.0", "lightning==2.3.0.dev20240328", - "jsonargparse[signatures]>=4.27.6", - "litserve>=0.1.0" # imported by litgpt.deploy + "jsonargparse[signatures]>=4.27.6" ] [project.urls] @@ -38,6 +37,7 @@ all = [ "tokenizers>=0.15.2", # pythia, falcon, redpajama "requests>=2.31.0", # litgpt.data "litdata>=0.2.2", # litgpt.data + "litserve>=0.1.0", # litgpt.deploy "zstandard>=0.22.0", # litgpt.data.prepare_slimpajama.py "pandas>=1.9.0", # litgpt.data.prepare_starcoder.py "pyarrow>=15.0.2", # litgpt.data.prepare_starcoder.py From bfe97c7facf184377181e1060365d13a0bb5b228 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Apr 2024 15:01:55 +0200 Subject: [PATCH 03/21] Fix `litgpt evaluate` not using the local checkpoint (#1357) --- litgpt/eval/evaluate.py | 29 +++++++++++++---------------- litgpt/utils.py | 2 +- tests/test_evaluate.py | 37 ++++++++++++++----------------------- 3 files changed, 28 insertions(+), 40 deletions(-) diff --git a/litgpt/eval/evaluate.py b/litgpt/eval/evaluate.py index 78e0ed0f59..29791630dd 100644 --- a/litgpt/eval/evaluate.py +++ b/litgpt/eval/evaluate.py @@ -4,7 +4,6 @@ import os from pathlib import Path from typing import Optional, Union -import yaml import torch from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint @@ -28,7 +27,7 @@ def prepare_results(results, save_filepath, print_results=True): def convert_and_evaluate( checkpoint_dir: Path, tasks: Optional[str] = None, - out_dir: Optional[str] = None, + out_dir: Optional[Path] = None, force_conversion: bool = False, num_fewshot: Optional[int] = None, batch_size: int = 1, @@ -36,7 +35,7 @@ def convert_and_evaluate( dtype: Optional[Union[str, torch.dtype]] = None, limit: Optional[float] = None, seed: int = 1234, - save_filepath: Optional[str] = None, + save_filepath: Optional[Path] = None, ) -> None: """Convert a LitGPT model and run the LM Evaluation Harness @@ -46,9 +45,7 @@ def convert_and_evaluate( Saves to `checkpoint_dir`/evaluate by default. force_conversion: Set to `True` to reconvert the model and override an existing model.pth from a previous evaluation call. - tasks: CSV of task names to evaluate. - By default, the following tasks are used: - "hellaswag,truthfulqa_mc2,mmlu" + tasks: CSV of task names to evaluate. Example: "hellaswag,truthfulqa_mc2,mmlu" num_fewshot: Number of examples in few-shot context. 
batch_size: Batch size configuration. device: Device to use for evaluation, for example, "cuda" or "cuda:0". @@ -84,22 +81,22 @@ def convert_and_evaluate( out_dir.mkdir(parents=True, exist_ok=True) save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath) - config_filepath = checkpoint_dir/"model_config.yaml" - with open(config_filepath, encoding="utf-8") as f: - config_dict = yaml.safe_load(f) - repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}" - - copy_config_files(source_dir=checkpoint_dir, out_dir=out_dir) - - model_path = out_dir / "model.pth" + model_path = out_dir / "pytorch_model.bin" if not model_path.exists() or force_conversion: + copy_config_files(source_dir=checkpoint_dir, out_dir=out_dir) convert_lit_checkpoint(checkpoint_dir=checkpoint_dir, output_dir=out_dir) + + # Hack: LitGPT's conversion doesn't save a pickle file that is compatible to be loaded with + # `torch.load(..., weights_only=True)`, which is a requirement in HFLM. + # So we're `torch.load`-ing and `torch.sav`-ing it again to work around this. + state_dict = torch.load(out_dir / "model.pth") + torch.save(state_dict, model_path) + os.remove(out_dir / "model.pth") from lm_eval.models.huggingface import HFLM - state_dict = torch.load(model_path) - model = HFLM(repo_id, state_dict=state_dict, device=device, batch_size=batch_size, dtype=dtype) + model = HFLM(pretrained=str(out_dir.resolve()), device=device, batch_size=batch_size, dtype=dtype) os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/litgpt/utils.py b/litgpt/utils.py index 8a64b94110..21f7f34a98 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -392,7 +392,7 @@ def __iter__(self) -> Self: def copy_config_files(source_dir: Path, out_dir: Path) -> None: """Copies the specified configuration and tokenizer files into the output directory.""" - config_files = ["generation_config.json", "model_config.yaml"] + config_files = ["config.json", "generation_config.json", "model_config.yaml"] tokenizer_files = ["tokenizer.json", "tokenizer.model", "tokenizer_config.json"] for file_name in config_files + tokenizer_files: diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 0cbbcc83de..12f8a68f9c 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -1,6 +1,5 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
-import shutil import subprocess import sys from contextlib import redirect_stdout @@ -9,7 +8,6 @@ from pathlib import Path from unittest import mock -import datasets import pytest import torch import yaml @@ -19,37 +17,30 @@ from litgpt.scripts.download import download_from_hub -@pytest.mark.xfail( - raises=(datasets.builder.DatasetGenerationError, NotImplementedError), - strict=False, - match="Loading a dataset cached in a LocalFileSystem is not supported", -) -def test_evaluate_script(tmp_path, monkeypatch): +def test_evaluate_script(tmp_path): ours_config = Config.from_name("pythia-14m") download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) - shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer.json"), str(tmp_path)) - shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer_config.json"), str(tmp_path)) + checkpoint_dir = tmp_path / "EleutherAI" / "pythia-14m" ours_model = GPT(ours_config) - checkpoint_path = tmp_path / "lit_model.pth" - torch.save(ours_model.state_dict(), checkpoint_path) - config_path = tmp_path / "model_config.yaml" - with open(config_path, "w", encoding="utf-8") as fp: + torch.save(ours_model.state_dict(), checkpoint_dir / "lit_model.pth") + with open( checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: yaml.dump(asdict(ours_config), fp) - fn_kwargs = dict( - checkpoint_dir=tmp_path, - out_dir=tmp_path / "out_dir", - device=None, - dtype=torch.float32, - limit=5, - tasks="mathqa" - ) stdout = StringIO() with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): - module.convert_and_evaluate(**fn_kwargs) + module.convert_and_evaluate( + checkpoint_dir=checkpoint_dir, + out_dir=tmp_path / "out_dir", + device=None, + dtype=torch.float32, + limit=5, + tasks="mathqa" + ) stdout = stdout.getvalue() + assert (tmp_path / "out_dir" / "results.json").is_file() assert "mathqa" in stdout assert "Metric" in stdout + assert "Loading checkpoint shards" not in stdout @pytest.mark.parametrize("mode", ["file", "entrypoint"]) From 5895df1004c3d05ef69f7aeffcfee757dbc42d58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 26 Apr 2024 18:10:17 +0200 Subject: [PATCH 04/21] Update table with new benchmark results (#1361) Co-authored-by: Sebastian Raschka --- config_hub/finetune/README.md | 115 +++++++++++++++++----------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index 9892cb339f..6e78528c52 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -3,66 +3,67 @@ The table below lists the performances you can expect from the provided config files. Note that you can achieve lower memory consumption by lowering the micro batch size as needed. In addition, you can lower the rank (`lora_r`) in the LoRA configuration files and disable LoRA for certain layers (for example, setting `lora_projection` and other LoRA layer-specific parameters to `false`). For more information, see the [Dealing with out-of-memory (OOM) errors](../../tutorials/oom.md) on lowering the memory requirements. The "Cost" column refers to the on-demand compute cost on [Lightning AI](https://lightning.ai) where these benchmarks were executed. +All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset. The "Multitask score" refers to [MMLU](https://arxiv.org/abs/2009.03300).   
-| Config | Model | Dataset | Epochs | Max seq length | Micro batch size | Precision | Machine | Training runtime | Cost | Peak memory | Validation loss | Validation perplexity | -| --------------------------------- | ---------------------- | -------- | ------ | -------------- | ---------------- | --------- | ------- | ---------------- | ---- | ----------- | --------------- | --------------------- | -| falcon-7b/lora.yaml | falcon-7b | Alpaca2k | 4 | 512 | 1 | bf16-true | 1xA10G | 24.84 min | $0.7 | 16.69 GB | 0.945 | 2.573 | -| falcon-7b/lora.yaml | falcon-7b | Alpaca2k | 4 | 512 | 1 | bf16-true | 4xA10G | 24.94 min | $2.0 | 16.69 GB | 0.945 | 2.573 | -| falcon-7b/qlora.yaml | falcon-7b | Alpaca2k | 4 | 512 | 1 | bf16-true | 1xA10G | 50.85 min | $1.5 | 9.44 GB | 0.993 | 2.699 | -| falcon-7b/qlora.yaml | falcon-7b | Alpaca2k | 4 | 512 | 1 | bf16-true | 4xA10G | 50.88 min | $4.1 | 9.44 GB | 0.993 | 2.699 | -| | | | | | | | | | | | | | -| gemma-2b/full.yaml | gemma-2b | Alpaca2k | 1 | 512 | 1 | bf16-true | 4xA10G | 14.06 min | $1.1 | 17.43 GB | 1.021 | 2.777 | -| gemma-2b/lora.yaml | gemma-2b | Alpaca2k | 2 | 512 | 2 | bf16-true | 1xA10G | 9.41 min | $0.3 | 12.62 GB | 0.981 | 2.666 | -| gemma-2b/lora.yaml | gemma-2b | Alpaca2k | 2 | 512 | 2 | bf16-true | 4xA10G | 9.41 min | $0.8 | 12.62 GB | 0.981 | 2.667 | -| gemma-2b/qlora.yaml | gemma-2b | Alpaca2k | 2 | 512 | 2 | bf16-true | 1xA10G | 12.91 min | $0.4 | 11.58 GB | 1.085 | 2.959 | -| gemma-2b/qlora.yaml | gemma-2b | Alpaca2k | 2 | 512 | 2 | bf16-true | 4xA10G | 12.91 min | $1.0 | 11.59 GB | 1.085 | 2.958 | -| | | | | | | | | | | | | | -| gemma-7b/lora.yaml | gemma-7b | Alpaca2k | 2 | 512 | 1 | bf16-true | 1xA10G | OOM | OOM | OOM | OOM | OOM | -| gemma-7b/lora.yaml | gemma-7b | Alpaca2k | 2 | 512 | 1 | bf16-true | 4xA10G | OOM | OOM | OOM | OOM | OOM | -| gemma-7b/qlora.yaml | gemma-7b | Alpaca2k | 2 | 512 | 1 | bf16-true | 1xA10G | 43.58 min | $1.3 | 17.18 GB | 0.973 | 2.646 | -| gemma-7b/qlora.yaml | gemma-7b | Alpaca2k | 2 | 512 | 1 | bf16-true | 4xA10G | 43.58 min | $3.5 | 17.18 GB | 0.983 | 2.672 | -| | | | | | | | | | | | | | -| llama-2-7b/full.yaml | llama-2-7b | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | OOM | OOM | OOM | OOM | OOM | -| llama-2-7b/lora.yaml | llama-2-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 32.82 min | $1.0 | 19.77 GB | 0.802 | 2.230 | -| llama-2-7b/lora.yaml | llama-2-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 32.83 min | $2.6 | 19.77 GB | 0.802 | 2.229 | -| llama-2-7b/qlora.yaml | llama-2-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 45.67 min | $1.4 | 13.68 GB | 0.814 | 2.258 | -| llama-2-7b/qlora.yaml | llama-2-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 45.69 min | $3.7 | 13.68 GB | 0.815 | 2.258 | -| | | | | | | | | | | | | | -| llama-3-8b/full.yaml | llama-3-8b | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | OOM | OOM | OOM | OOM | OOM | -| llama-3-8b/lora.yaml | llama-3-8b | Alpaca2k | 2 | 512 | 1 | bf16-true | 1xA10G | 14.79 min | $0.4 | 19.73 GB | 0.888 | 2.431 | -| llama-3-8b/lora.yaml | llama-3-8b | Alpaca2k | 2 | 512 | 1 | bf16-true | 4xA10G | 14.88 min | $1.2 | 19.73 GB | 0.889 | 2.432 | -| llama-3-8b/qlora.yaml | llama-3-8b | Alpaca2k | 2 | 512 | 2 | bf16-true | 1xA10G | 22.24 min | $0.7 | 17.41 GB | 0.939 | 2.558 | -| llama-3-8b/qlora.yaml | llama-3-8b | Alpaca2k | 2 | 512 | 2 | bf16-true | 4xA10G | 22.20 min | $1.8 | 17.41 GB | 0.939 | 2.557 | -| | | | | | | | | | | | | | -| mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 
31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | -| mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | -| mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.813 | 2.255 | -| mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 44.75 min | $3.6 | 14.29 GB | 0.813 | 2.254 | -| | | | | | | | | | | | | | -| mistral-7b/lora.yaml | mistral-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 31.01 min | $0.9 | 20.66 GB | 0.794 | 2.211 | -| mistral-7b/lora.yaml | mistral-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 31.03 min | $2.5 | 20.66 GB | 0.796 | 2.218 | -| mistral-7b/qlora.yaml | mistral-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.803 | 2.231 | -| mistral-7b/qlora.yaml | mistral-7b | Alpaca2k | 4 | 512 | 2 | bf16-true | 4xA10G | 44.81 min | $3.6 | 14.29 GB | 0.803 | 2.233 | -| | | | | | | | | | | | | | -| phi-2/full.yaml | phi-2 | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | 11.87 min | $1.0 | 14.44 GB | 1.305 | 3.688 | -| phi-2/lora.yaml | phi-2 | Alpaca2k | 1 | 512 | 4 | bf16-true | 1xA10G | 3.78 min | $0.1 | 13.98 GB | 0.819 | 2.269 | -| phi-2/lora.yaml | phi-2 | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | 3.78 min | $0.3 | 13.98 GB | 0.820 | 2.271 | -| phi-2/qlora.yaml | phi-2 | Alpaca2k | 1 | 512 | 4 | bf16-true | 1xA10G | 4.51 min | $0.1 | 14.27 GB | 0.837 | 2.310 | -| phi-2/qlora.yaml | phi-2 | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | 4.52 min | $0.4 | 14.27 GB | 0.837 | 2.309 | -| | | | | | | | | | | | | | -| stablelm-base-alpha-3b/full.yaml | stablelm-base-alpha-3b | Alpaca2k | 1 | 512 | 1 | bf16-true | 4xA10G | 70.13 min | $5.6 | 21.23 GB | 1.513 | 4.540 | -| stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | Alpaca2k | 4 | 512 | 1 | bf16-true | 1xA10G | 13.07 min | $0.4 | 8.58 GB | 1.361 | 3.900 | -| stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | Alpaca2k | 4 | 512 | 1 | bf16-true | 4xA10G | 13.16 min | $1.1 | 8.58 GB | 1.362 | 3.906 | -| stablelm-base-alpha-3b/qlora.yaml | stablelm-base-alpha-3b | Alpaca2k | 4 | 512 | 1 | bf16-true | 1xA10G | 25.86 min | $0.8 | 5.24 GB | 1.388 | 4.009 | -| stablelm-base-alpha-3b/qlora.yaml | stablelm-base-alpha-3b | Alpaca2k | 4 | 512 | 1 | bf16-true | 4xA10G | 25.80 min | $2.1 | 5.24 GB | 1.391 | 4.020 | -| | | | | | | | | | | | | | -| tiny-llama/full.yaml | tiny-llama | Alpaca2k | 1 | 512 | 4 | bf16-true | 1xA10G | 2.58 min | $0.1 | 14.10 GB | 1.088 | 2.968 | -| tiny-llama/full.yaml | tiny-llama | Alpaca2k | 1 | 512 | 4 | bf16-true | 4xA10G | 2.57 min | $0.2 | 14.10 GB | 1.088 | 2.968 | -| tiny-llama/lora.yaml | tiny-llama | Alpaca2k | 3 | 512 | 8 | bf16-true | 1xA10G | 8.09 min | $0.2 | 13.50 GB | 1.039 | 2.826 | -| tiny-llama/qlora.yaml | tiny-llama | Alpaca2k | 3 | 512 | 8 | bf16-true | 1xA10G | 8.70 min | $0.3 | 16.24 GB | 1.056 | 2.874 | -| tiny-llama/qlora.yaml | tiny-llama | Alpaca2k | 3 | 512 | 8 | bf16-true | 4xA10G | 8.70 min | $0.7 | 16.24 GB | 1.056 | 2.874 | +| Config | Model | Epochs | Max seq length | Micro batch size | Machine | Training runtime | Cost | Peak memory | Validation loss | Validation perplexity | Multitask score (MMLU) | +| --------------------------------- | ---------------------- | ------ | -------------- | ---------------- | ------- | ---------------- | ---- | ----------- | --------------- | --------------------- | --------------- | +| 
falcon-7b/lora.yaml | falcon-7b | 4 | 512 | 1 | 1xA10G | 24.84 min | $0.7 | 16.69 GB | 0.945 | 2.573 | 26.2% | +| falcon-7b/lora.yaml | falcon-7b | 4 | 512 | 1 | 4xA10G | 24.94 min | $2.0 | 16.69 GB | 0.945 | 2.573 | 26.4% | +| falcon-7b/qlora.yaml | falcon-7b | 4 | 512 | 1 | 1xA10G | 50.85 min | $1.5 | 9.44 GB | 0.993 | 2.699 | 26.3% | +| falcon-7b/qlora.yaml | falcon-7b | 4 | 512 | 1 | 4xA10G | 50.88 min | $4.1 | 9.44 GB | 0.993 | 2.699 | 26.3% | +| | | | | | | | | | | | | +| gemma-2b/full.yaml | gemma-2b | 1 | 512 | 1 | 4xA10G | 14.06 min | $1.1 | 17.43 GB | 1.021 | 2.777 | 32.4% | +| gemma-2b/lora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 9.41 min | $0.3 | 12.62 GB | 0.981 | 2.666 | 34.4% | +| gemma-2b/lora.yaml | gemma-2b | 2 | 512 | 2 | 4xA10G | 9.41 min | $0.8 | 12.62 GB | 0.981 | 2.667 | 34.0% | +| gemma-2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 1xA10G | 12.91 min | $0.4 | 11.58 GB | 1.085 | 2.959 | 36.4% | +| gemma-2b/qlora.yaml | gemma-2b | 2 | 512 | 2 | 4xA10G | 12.91 min | $1.0 | 11.59 GB | 1.085 | 2.958 | 36.4% | +| | | | | | | | | | | | | +| gemma-7b/lora.yaml | gemma-7b | 2 | 512 | 1 | 1xA10G | OOM | OOM | OOM | OOM | OOM | | +| gemma-7b/lora.yaml | gemma-7b | 2 | 512 | 1 | 4xA10G | OOM | OOM | OOM | OOM | OOM | | +| gemma-7b/qlora.yaml | gemma-7b | 2 | 512 | 1 | 1xA10G | 43.58 min | $1.3 | 17.18 GB | 0.973 | 2.646 | | +| gemma-7b/qlora.yaml | gemma-7b | 2 | 512 | 1 | 4xA10G | 43.58 min | $3.5 | 17.18 GB | 0.983 | 2.672 | | +| | | | | | | | | | | | | +| llama-2-7b/full.yaml | llama-2-7b | 1 | 512 | 4 | 4xA10G | OOM | OOM | OOM | OOM | OOM | | +| llama-2-7b/lora.yaml | llama-2-7b | 4 | 512 | 2 | 1xA10G | 32.82 min | $1.0 | 19.77 GB | 0.802 | 2.230 | 40.3% | +| llama-2-7b/lora.yaml | llama-2-7b | 4 | 512 | 2 | 4xA10G | 32.83 min | $2.6 | 19.77 GB | 0.802 | 2.229 | 40.2% | +| llama-2-7b/qlora.yaml | llama-2-7b | 4 | 512 | 2 | 1xA10G | 45.67 min | $1.4 | 13.68 GB | 0.814 | 2.258 | 38.6% | +| llama-2-7b/qlora.yaml | llama-2-7b | 4 | 512 | 2 | 4xA10G | 45.69 min | $3.7 | 13.68 GB | 0.815 | 2.258 | 38.6% | +| | | | | | | | | | | | | +| llama-3-8b/full.yaml | llama-3-8b | 1 | 512 | 4 | 4xA10G | OOM | OOM | OOM | OOM | OOM | | +| llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 1xA10G | 14.79 min | $0.4 | 19.73 GB | 0.888 | 2.431 | 62.4% | +| llama-3-8b/lora.yaml | llama-3-8b | 2 | 512 | 1 | 4xA10G | 14.88 min | $1.2 | 19.73 GB | 0.889 | 2.432 | 62.5% | +| llama-3-8b/qlora.yaml | llama-3-8b | 2 | 512 | 2 | 1xA10G | 22.24 min | $0.7 | 17.41 GB | 0.939 | 2.558 | 62.2% | +| llama-3-8b/qlora.yaml | llama-3-8b | 2 | 512 | 2 | 4xA10G | 22.20 min | $1.8 | 17.41 GB | 0.939 | 2.557 | 62.2% | +| | | | | | | | | | | | | +| mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 31.00 min | $0.9 | 20.66 GB | 0.801 | 2.228 | 55.7% | +| mistral-7b-v0.2/lora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 31.00 min | $2.5 | 20.66 GB | 0.802 | 2.229 | 55.5% | +| mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.813 | 2.255 | 56.5% | +| mistral-7b-v0.2/qlora.yaml | mistral-7b-v0.2 | 4 | 512 | 2 | 4xA10G | 44.75 min | $3.6 | 14.29 GB | 0.813 | 2.254 | 56.3% | +| | | | | | | | | | | | | +| mistral-7b/lora.yaml | mistral-7b | 4 | 512 | 2 | 1xA10G | 31.01 min | $0.9 | 20.66 GB | 0.794 | 2.211 | 57.9% | +| mistral-7b/lora.yaml | mistral-7b | 4 | 512 | 2 | 4xA10G | 31.03 min | $2.5 | 20.66 GB | 0.796 | 2.218 | 57.9% | +| mistral-7b/qlora.yaml | mistral-7b | 4 | 512 | 2 | 1xA10G | 44.75 min | $1.3 | 14.29 GB | 0.803 | 2.231 | 57.9% | +| 
mistral-7b/qlora.yaml | mistral-7b | 4 | 512 | 2 | 4xA10G | 44.81 min | $3.6 | 14.29 GB | 0.803 | 2.233 | 57.6% | +| | | | | | | | | | | | | +| phi-2/full.yaml | phi-2 | 1 | 512 | 4 | 4xA10G | 11.87 min | $1.0 | 14.44 GB | 1.305 | 3.688 | 38.4% | +| phi-2/lora.yaml | phi-2 | 1 | 512 | 4 | 1xA10G | 3.78 min | $0.1 | 13.98 GB | 0.819 | 2.269 | 53.0% | +| phi-2/lora.yaml | phi-2 | 1 | 512 | 4 | 4xA10G | 3.78 min | $0.3 | 13.98 GB | 0.820 | 2.271 | 52.4% | +| phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 1xA10G | 4.51 min | $0.1 | 14.27 GB | 0.837 | 2.310 | 52.3% | +| phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 4xA10G | 4.52 min | $0.4 | 14.27 GB | 0.837 | 2.309 | 52.3% | +| | | | | | | | | | | | | +| stablelm-base-alpha-3b/full.yaml | stablelm-base-alpha-3b | 1 | 512 | 1 | 4xA10G | 70.13 min | $5.6 | 21.23 GB | 1.513 | 4.540 | 23.2% | +| stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 1xA10G | 13.07 min | $0.4 | 8.58 GB | 1.361 | 3.900 | 25.9% | +| stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 4xA10G | 13.16 min | $1.1 | 8.58 GB | 1.362 | 3.906 | 25.9% | +| stablelm-base-alpha-3b/qlora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 1xA10G | 25.86 min | $0.8 | 5.24 GB | 1.388 | 4.009 | 26.1% | +| stablelm-base-alpha-3b/qlora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 4xA10G | 25.80 min | $2.1 | 5.24 GB | 1.391 | 4.020 | 26.6% | +| | | | | | | | | | | | | +| tiny-llama/full.yaml | tiny-llama | 1 | 512 | 4 | 1xA10G | 2.58 min | $0.1 | 14.10 GB | 1.088 | 2.968 | 24.6% | +| tiny-llama/full.yaml | tiny-llama | 1 | 512 | 4 | 4xA10G | 2.57 min | $0.2 | 14.10 GB | 1.088 | 2.968 | 24.5% | +| tiny-llama/lora.yaml | tiny-llama | 3 | 512 | 8 | 1xA10G | 8.09 min | $0.2 | 13.50 GB | 1.039 | 2.826 | 25.5% | +| tiny-llama/qlora.yaml | tiny-llama | 3 | 512 | 8 | 1xA10G | 8.70 min | $0.3 | 16.24 GB | 1.056 | 2.874 | 25.3% | +| tiny-llama/qlora.yaml | tiny-llama | 3 | 512 | 8 | 4xA10G | 8.70 min | $0.7 | 16.24 GB | 1.056 | 2.874 | 25.4% | *OOM = Out of memory From 22883368b1cffea8c8034c81e18950982420d419 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Sat, 27 Apr 2024 13:59:28 -0400 Subject: [PATCH 05/21] Add Mixtral MoE to README (#1365) --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9cba60e755..19e0d89a88 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,11 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials | Model | Model size | Author | Reference | |----|----|----|----| -| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | -| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | +| Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | +| Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) | -| Mistral | 7B | Mistral AI | [Mistral website](https://mistral.ai/) | +| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) | +| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) | | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | | ... | ... | ... | ... 
| @@ -78,7 +79,8 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials | Llama 2 | 7B, 13B, 70B | Meta AI | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | | Llama 3 | 8B, 70B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) | | LongChat | 7B, 13B | LMSYS | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) | -| Mistral | 7B | Mistral AI | [Mistral website](https://mistral.ai/) | +| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) | +| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) | | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) | | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | | Phi | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | From 3bd08dd560fd0b7da6dfa72c95b54c6042c4d93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 29 Apr 2024 19:42:34 +0200 Subject: [PATCH 06/21] Add link to Studio for benchmarks (#1370) Co-authored-by: Sebastian Raschka --- config_hub/finetune/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md index 6e78528c52..55b3d8d286 100644 --- a/config_hub/finetune/README.md +++ b/config_hub/finetune/README.md @@ -2,7 +2,7 @@ The table below lists the performances you can expect from the provided config files. Note that you can achieve lower memory consumption by lowering the micro batch size as needed. In addition, you can lower the rank (`lora_r`) in the LoRA configuration files and disable LoRA for certain layers (for example, setting `lora_projection` and other LoRA layer-specific parameters to `false`). For more information, see the [Dealing with out-of-memory (OOM) errors](../../tutorials/oom.md) on lowering the memory requirements. -The "Cost" column refers to the on-demand compute cost on [Lightning AI](https://lightning.ai) where these benchmarks were executed. +The "Cost" column refers to the on-demand compute cost on [Lightning AI Studios where these benchmarks were executed](https://lightning.ai/lightning-ai/studios/automated-benchmarks-for-litgpt). All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset. The "Multitask score" refers to [MMLU](https://arxiv.org/abs/2009.03300).   
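
The memory levers that the finetuning README above points to (the micro batch size, the LoRA rank `lora_r`, and per-layer switches such as `lora_projection`) are ordinary fields in the `config_hub/finetune` YAML recipes, so a lower-memory variant of any benchmarked run can be derived by editing a copy of the recipe rather than the code. The snippet below is a minimal sketch, not part of the patches in this series: it assumes the key layout of the shipped configs (`lora_r` and `lora_projection` at the top level, `micro_batch_size` nested under `train:`) and uses a made-up output filename.

```python
# Minimal sketch (not part of the patches above): derive a lower-memory variant
# of a config-hub finetuning recipe. Key names follow the shipped YAML files;
# the output path is invented for illustration.
from pathlib import Path

import yaml

src = Path("config_hub/finetune/llama-2-7b/lora.yaml")
dst = Path("config_hub/finetune/llama-2-7b/lora-lowmem.yaml")  # hypothetical name

with open(src, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

cfg["lora_r"] = 4                     # smaller LoRA rank
cfg["lora_projection"] = False        # disable LoRA on the projection layer
cfg["train"]["micro_batch_size"] = 1  # smaller micro batches -> lower peak memory

with open(dst, "w", encoding="utf-8") as f:
    yaml.dump(cfg, f)
```

The derived file can then be passed back through the CLI's `--config` flag in the same way as the stock recipes benchmarked in the table above.
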
From fe1374ce0e9b3507fb5e2cb5333723c7f0a62b7d Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Mon, 29 Apr 2024 13:04:45 -0500 Subject: [PATCH 07/21] Print initial validation loss + final validation loss (#1228) --- config_hub/finetune/falcon-7b/lora.yaml | 3 +++ config_hub/finetune/falcon-7b/qlora.yaml | 3 +++ config_hub/finetune/gemma-2b/full.yaml | 3 +++ config_hub/finetune/gemma-2b/lora.yaml | 3 +++ config_hub/finetune/gemma-2b/qlora.yaml | 3 +++ config_hub/finetune/gemma-7b/lora.yaml | 3 +++ config_hub/finetune/gemma-7b/qlora.yaml | 3 +++ config_hub/finetune/llama-2-7b/full.yaml | 3 +++ config_hub/finetune/llama-2-7b/lora.yaml | 3 +++ config_hub/finetune/llama-2-7b/qlora.yaml | 3 +++ config_hub/finetune/llama-3-8b/full.yaml | 3 +++ config_hub/finetune/llama-3-8b/lora.yaml | 3 +++ config_hub/finetune/llama-3-8b/qlora.yaml | 3 +++ config_hub/finetune/mistral-7b-v0.2/lora.yaml | 3 +++ config_hub/finetune/mistral-7b-v0.2/qlora.yaml | 3 +++ config_hub/finetune/mistral-7b/lora.yaml | 3 +++ config_hub/finetune/mistral-7b/qlora.yaml | 3 +++ config_hub/finetune/phi-2/full.yaml | 3 +++ config_hub/finetune/phi-2/lora.yaml | 3 +++ config_hub/finetune/phi-2/qlora.yaml | 3 +++ config_hub/finetune/stablelm-base-alpha-3b/full.yaml | 3 +++ config_hub/finetune/stablelm-base-alpha-3b/lora.yaml | 3 +++ config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml | 3 +++ config_hub/finetune/tiny-llama/full.yaml | 3 +++ config_hub/finetune/tiny-llama/lora.yaml | 3 +++ config_hub/finetune/tiny-llama/qlora.yaml | 3 +++ config_hub/pretrain/debug.yaml | 3 +++ config_hub/pretrain/tinyllama.yaml | 3 +++ config_hub/pretrain/tinystories.yaml | 3 +++ litgpt/args.py | 2 ++ litgpt/finetune/adapter.py | 8 ++++++-- litgpt/finetune/adapter_v2.py | 8 ++++++-- litgpt/finetune/full.py | 9 +++++++-- litgpt/finetune/lora.py | 8 ++++++-- litgpt/pretrain.py | 9 +++++++-- 35 files changed, 121 insertions(+), 10 deletions(-) diff --git a/config_hub/finetune/falcon-7b/lora.yaml b/config_hub/finetune/falcon-7b/lora.yaml index eab0954182..c45b0fed94 100644 --- a/config_hub/finetune/falcon-7b/lora.yaml +++ b/config_hub/finetune/falcon-7b/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/falcon-7b/qlora.yaml b/config_hub/finetune/falcon-7b/qlora.yaml index dfc5377bd8..33ab9d9fc3 100644 --- a/config_hub/finetune/falcon-7b/qlora.yaml +++ b/config_hub/finetune/falcon-7b/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/gemma-2b/full.yaml b/config_hub/finetune/gemma-2b/full.yaml index 77f20658ca..879f1afee9 100644 --- a/config_hub/finetune/gemma-2b/full.yaml +++ b/config_hub/finetune/gemma-2b/full.yaml @@ -85,6 +85,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. 
(type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/gemma-2b/lora.yaml b/config_hub/finetune/gemma-2b/lora.yaml index c9f912a47c..91af82800d 100644 --- a/config_hub/finetune/gemma-2b/lora.yaml +++ b/config_hub/finetune/gemma-2b/lora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/gemma-2b/qlora.yaml b/config_hub/finetune/gemma-2b/qlora.yaml index dc15fe90d3..159ae2cc86 100644 --- a/config_hub/finetune/gemma-2b/qlora.yaml +++ b/config_hub/finetune/gemma-2b/qlora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/gemma-7b/lora.yaml b/config_hub/finetune/gemma-7b/lora.yaml index d7d56f5b5c..59120c5d0b 100644 --- a/config_hub/finetune/gemma-7b/lora.yaml +++ b/config_hub/finetune/gemma-7b/lora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/gemma-7b/qlora.yaml b/config_hub/finetune/gemma-7b/qlora.yaml index 7d4a2c634c..556fba0cf5 100644 --- a/config_hub/finetune/gemma-7b/qlora.yaml +++ b/config_hub/finetune/gemma-7b/qlora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml index 10e439b2de..99de788c74 100644 --- a/config_hub/finetune/llama-2-7b/full.yaml +++ b/config_hub/finetune/llama-2-7b/full.yaml @@ -88,6 +88,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml index 91f326757a..594b2f924d 100644 --- a/config_hub/finetune/llama-2-7b/lora.yaml +++ b/config_hub/finetune/llama-2-7b/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. 
(type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-2-7b/qlora.yaml b/config_hub/finetune/llama-2-7b/qlora.yaml index a3b7cb8dde..106b9422f4 100644 --- a/config_hub/finetune/llama-2-7b/qlora.yaml +++ b/config_hub/finetune/llama-2-7b/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-3-8b/full.yaml b/config_hub/finetune/llama-3-8b/full.yaml index 11aebcb155..e06d037710 100644 --- a/config_hub/finetune/llama-3-8b/full.yaml +++ b/config_hub/finetune/llama-3-8b/full.yaml @@ -88,6 +88,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-3-8b/lora.yaml b/config_hub/finetune/llama-3-8b/lora.yaml index 700a3b62f4..1d874a0690 100644 --- a/config_hub/finetune/llama-3-8b/lora.yaml +++ b/config_hub/finetune/llama-3-8b/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/llama-3-8b/qlora.yaml b/config_hub/finetune/llama-3-8b/qlora.yaml index 1da95eaac5..33a0fc98be 100644 --- a/config_hub/finetune/llama-3-8b/qlora.yaml +++ b/config_hub/finetune/llama-3-8b/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/mistral-7b-v0.2/lora.yaml b/config_hub/finetune/mistral-7b-v0.2/lora.yaml index aad8f7c986..f56e34c525 100644 --- a/config_hub/finetune/mistral-7b-v0.2/lora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml index e2f5c3aafc..b648b24d72 100644 --- a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml +++ b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. 
(type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/mistral-7b/lora.yaml b/config_hub/finetune/mistral-7b/lora.yaml index adfed6b08d..e991ec424e 100644 --- a/config_hub/finetune/mistral-7b/lora.yaml +++ b/config_hub/finetune/mistral-7b/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/mistral-7b/qlora.yaml b/config_hub/finetune/mistral-7b/qlora.yaml index 7972048f46..e43b745bb8 100644 --- a/config_hub/finetune/mistral-7b/qlora.yaml +++ b/config_hub/finetune/mistral-7b/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml index 65040a393e..5b302a48ac 100644 --- a/config_hub/finetune/phi-2/full.yaml +++ b/config_hub/finetune/phi-2/full.yaml @@ -88,6 +88,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/phi-2/lora.yaml b/config_hub/finetune/phi-2/lora.yaml index a3f348c8b2..2571bc02d0 100644 --- a/config_hub/finetune/phi-2/lora.yaml +++ b/config_hub/finetune/phi-2/lora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/phi-2/qlora.yaml b/config_hub/finetune/phi-2/qlora.yaml index aa2c36d40a..d48d910939 100644 --- a/config_hub/finetune/phi-2/qlora.yaml +++ b/config_hub/finetune/phi-2/qlora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml index bd68af8714..c196fcc017 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml @@ -85,6 +85,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. 
(type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml index e674cc8419..6e52ea2175 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml @@ -114,6 +114,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml index 27b579cbd8..ebd2f098eb 100644 --- a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml +++ b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml @@ -116,6 +116,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/tiny-llama/full.yaml b/config_hub/finetune/tiny-llama/full.yaml index 4bc09e460b..fe1d1ef99d 100644 --- a/config_hub/finetune/tiny-llama/full.yaml +++ b/config_hub/finetune/tiny-llama/full.yaml @@ -85,6 +85,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml index 4991900954..c42ff28ff3 100644 --- a/config_hub/finetune/tiny-llama/lora.yaml +++ b/config_hub/finetune/tiny-llama/lora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/finetune/tiny-llama/qlora.yaml b/config_hub/finetune/tiny-llama/qlora.yaml index 1e8cf20b8a..7e80e4d0ca 100644 --- a/config_hub/finetune/tiny-llama/qlora.yaml +++ b/config_hub/finetune/tiny-llama/qlora.yaml @@ -115,6 +115,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) logger_name: csv diff --git a/config_hub/pretrain/debug.yaml b/config_hub/pretrain/debug.yaml index bbe2fee2cc..e89dda3cc9 100644 --- a/config_hub/pretrain/debug.yaml +++ b/config_hub/pretrain/debug.yaml @@ -88,6 +88,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # How many devices/GPUs to use. Uses all GPUs by default. 
(type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinyllama.yaml b/config_hub/pretrain/tinyllama.yaml index a47bd946f3..e2418a5b17 100644 --- a/config_hub/pretrain/tinyllama.yaml +++ b/config_hub/pretrain/tinyllama.yaml @@ -88,6 +88,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) devices: auto diff --git a/config_hub/pretrain/tinystories.yaml b/config_hub/pretrain/tinystories.yaml index 8ef1232862..8ed53a09d7 100644 --- a/config_hub/pretrain/tinystories.yaml +++ b/config_hub/pretrain/tinystories.yaml @@ -104,6 +104,9 @@ eval: # Number of iterations (type: int, default: 100) max_iters: 100 + # Whether to evaluate on the validation set at the beginning of the training + initial_validation: false + # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) devices: auto diff --git a/litgpt/args.py b/litgpt/args.py index b227ffe3f6..7e277fe9e6 100644 --- a/litgpt/args.py +++ b/litgpt/args.py @@ -79,3 +79,5 @@ class EvalArgs: """Number of tokens to generate""" max_iters: int = 100 """Number of iterations""" + initial_validation: bool = False + """Whether to evaluate on the validation set at the beginning of the training""" diff --git a/litgpt/finetune/adapter.py b/litgpt/finetune/adapter.py index be21af318d..313d0ea8e7 100644 --- a/litgpt/finetune/adapter.py +++ b/litgpt/finetune/adapter.py @@ -220,7 +220,12 @@ def fit( f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + if eval.initial_validation: + val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader))) + val_loss = f"{val_loss:.3f}" + else: + validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + val_loss = "n/a" train_iterator = CycleIterator(train_dataloader) throughput = ThroughputMonitor(fabric, window_size=50) @@ -232,7 +237,6 @@ def fit( iter_num = 0 total_lengths = 0 total_t0 = time.perf_counter() - val_loss = "n/a" while step_count < max_steps and train_iterator.epoch < train.epochs: iter_num += 1 diff --git a/litgpt/finetune/adapter_v2.py b/litgpt/finetune/adapter_v2.py index f354decfd0..39b2a2d0e2 100644 --- a/litgpt/finetune/adapter_v2.py +++ b/litgpt/finetune/adapter_v2.py @@ -220,7 +220,12 @@ def fit( f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + if eval.initial_validation: + val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader))) + val_loss = f"{val_loss:.3f}" + else: + validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + val_loss = "n/a" train_iterator = CycleIterator(train_dataloader) throughput = ThroughputMonitor(fabric, window_size=50) @@ -232,7 +237,6 @@ def fit( iter_num = 0 total_lengths = 0 total_t0 = time.perf_counter() - val_loss = "n/a" while step_count < max_steps and train_iterator.epoch < train.epochs: iter_num += 1 diff --git a/litgpt/finetune/full.py b/litgpt/finetune/full.py index 23de9b622c..01db855189 100644 --- a/litgpt/finetune/full.py +++ b/litgpt/finetune/full.py @@ 
-194,7 +194,13 @@ def fit( f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + if eval.initial_validation: + val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader))) + val_loss = f"{val_loss:.3f}" + else: + validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + val_loss = "n/a" + initial_iter = state["iter_num"] max_steps = train.max_steps or float("inf") train_iterator = CycleIterator(train_dataloader) @@ -216,7 +222,6 @@ def fit( fabric.device ) fabric.barrier() - val_loss = "n/a" while state["step_count"] < max_steps and train_iterator.epoch < train.epochs: state["iter_num"] += 1 diff --git a/litgpt/finetune/lora.py b/litgpt/finetune/lora.py index 39e805befe..ae48bbc8fe 100644 --- a/litgpt/finetune/lora.py +++ b/litgpt/finetune/lora.py @@ -251,7 +251,12 @@ def fit( f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + if eval.initial_validation: + val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader))) + val_loss = f"{val_loss:.3f}" + else: + validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2)) # sanity check + val_loss = "n/a" train_iterator = CycleIterator(train_dataloader) throughput = ThroughputMonitor(fabric, window_size=50) @@ -263,7 +268,6 @@ def fit( iter_num = 0 total_lengths = 0 total_t0 = time.perf_counter() - val_loss = "n/a" while step_count < max_steps and train_iterator.epoch < train.epochs: iter_num += 1 diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py index 3a763116a0..d5014dc022 100644 --- a/litgpt/pretrain.py +++ b/litgpt/pretrain.py @@ -228,7 +228,13 @@ def fit( model = state["model"] optimizer = state["optimizer"] - validate(fabric, model, val_dataloader, max_iters=2) # sanity check + if eval.initial_validation: + val_loss = validate(fabric, model, val_dataloader, max_iters=eval.max_iters) + val_loss = f"{val_loss:.3f}" + else: + validate(fabric, model, val_dataloader, max_iters=2) # sanity check + val_loss = "n/a" + throughput = ThroughputMonitor(fabric, window_size=5) with torch.device("meta"): @@ -252,7 +258,6 @@ def fit( ) fabric.barrier() total_t0 = time.perf_counter() - val_loss = "n/a" warmup_iters = train.warmup_iters(devices, max_iters, train_dataloader) From 60140755d7dbb89cf899b23404c783c99a46ac86 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Mon, 29 Apr 2024 13:22:47 -0500 Subject: [PATCH 08/21] Change examples to phi-2 (#1371) --- README.md | 60 +++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 19e0d89a88..6bca484f1a 100644 --- a/README.md +++ b/README.md @@ -33,22 +33,22 @@ Uses the latest state-of-the-art techniques: LitGPT steps   -# Finetune, pretrain and deploy LLMs Lightning fast ⚡⚡ +# Finetune, pretrain and deploy LLMs Lightning fast ⚡⚡ LitGPT is a command-line tool designed to easily [finetune](#finetune-an-llm), [pretrain](#pretrain-an-llm), [evaluate](#use-an-llm), and [deploy](#deploy-an-llm) [20+ LLMs](#choose-from-20-llms) **on your own data**. It features highly-optimized [training recipes](#training-recipes) for the world's most powerful open-source large language models (LLMs). 
-We reimplemented all model architectures and training recipes from scratch for 4 reasons: +We reimplemented all model architectures and training recipes from scratch for 4 reasons: -1. Remove all abstraction layers and have single file implementations. -2. Guarantee Apache 2.0 compliance to enable enterprise use without limits. -3. Optimized each model's architectural detail to maximize performance, reduce costs, and speed up training. -4. Highly-optimized [recipe configs](#training-recipes) we have tested at enterprise scale. +1. Remove all abstraction layers and have single file implementations. +2. Guarantee Apache 2.0 compliance to enable enterprise use without limits. +3. Optimized each model's architectural detail to maximize performance, reduce costs, and speed up training. +4. Highly-optimized [recipe configs](#training-recipes) we have tested at enterprise scale. ---   # Choose from 20+ LLMs -LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials/download_model_weights.md) without layers of abstraction: +LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials/download_model_weights.md) without layers of abstraction: | Model | Model size | Author | Reference | |----|----|----|----| @@ -63,10 +63,10 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials
See full list of 20+ LLMs -  +  #### All models - + | Model | Model size | Author | Reference | |----|----|----|----| | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | @@ -123,36 +123,36 @@ pip install -e '.[all]'   # Quick start -After installing LitGPT, select the model and action you want to take on that model (finetune, pretrain, evaluate, deploy, etc...): +After installing LitGPT, select the model and action you want to take on that model (finetune, pretrain, evaluate, deploy, etc...): ```bash # ligpt [action] [model] litgpt download meta-llama/Meta-Llama-3-8B-Instruct litgpt chat meta-llama/Meta-Llama-3-8B-Instruct -litgpt finetune meta-llama/Meta-Llama-3-8B-Instruct -litgpt pretrain meta-llama/Meta-Llama-3-8B-Instruct -litgpt serve meta-llama/Meta-Llama-3-8B-Instruct +litgpt finetune meta-llama/Meta-Llama-3-8B-Instruct +litgpt pretrain meta-llama/Meta-Llama-3-8B-Instruct +litgpt serve meta-llama/Meta-Llama-3-8B-Instruct ```   ### Use an LLM for inference -Use LLMs for inference to test its chatting capabilities, run evaluations, or extract embeddings, etc... -Here's an example showing how to use the Mistral 7B LLM. +Use LLMs for inference to test its chatting capabilities, run evaluations, or extract embeddings, etc... +Here's an example showing how to use the Phi-2 LLM. Open In Studio -  +  ```bash # 1) Download a pretrained model -litgpt download --repo_id mistralai/Mistral-7B-Instruct-v0.2 +litgpt download --repo_id microsoft/phi-2 # 2) Chat with the model litgpt chat \ - --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 + --checkpoint_dir checkpoints/microsoft/phi-2 >> Prompt: What do Llamas eat? ``` @@ -168,7 +168,7 @@ For more information, refer to the [download](tutorials/download_model_weights.m Open In Studio -  +  ```bash # 1) Download a pretrained model @@ -191,14 +191,14 @@ litgpt chat \   -### Pretrain an LLM +### Pretrain an LLM Train an LLM from scratch on your own data via pretraining: Open In Studio -  +  ```bash mkdir -p custom_texts @@ -226,15 +226,15 @@ litgpt chat \   -### Continue pretraining an LLM -This is another way of finetuning that specializes an already pretrained model by training on custom data: +### Continue pretraining an LLM +This is another way of finetuning that specializes an already pretrained model by training on custom data: Open In Studio -  +  ```bash mkdir -p custom_texts @@ -262,7 +262,7 @@ litgpt chat \   ### Deploy an LLM -Once you're ready to deploy a finetuned LLM, run this command: +Once you're ready to deploy a finetuned LLM, run this command: Open In Studio @@ -271,20 +271,20 @@ Once you're ready to deploy a finetuned LLM, run this command:   ```bash -# locate the checkpoint to your finetuned or pretrained model and call the `serve` command: +# locate the checkpoint to your finetuned or pretrained model and call the `serve` command: litgpt serve --checkpoint_dir path/to/your/checkpoint/microsoft/phi-2 -# Alternative: if you haven't finetuned, download any checkpoint to deploy it: +# Alternative: if you haven't finetuned, download any checkpoint to deploy it: litgpt download --repo_id microsoft/phi-2 litgpt serve --checkpoint_dir checkpoints/microsoft/phi-2 ``` -Test the server in a separate terminal and integrate the model API into your AI product: +Test the server in a separate terminal and integrate the model API into your AI product: ```python # 3) Use the server (in a separate session) import requests, json response = requests.post( - 
"http://127.0.0.1:8000/predict", + "http://127.0.0.1:8000/predict", json={"prompt": "Fix typos in the following sentence: Exampel input"} ) print(response.json()["output"]) @@ -493,7 +493,7 @@ litgpt finetune \   -# Community +# Community ## Get involved! From 47806045a376cd4dbf48d6bfc41c97659239de4d Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Mon, 29 Apr 2024 15:49:35 -0700 Subject: [PATCH 09/21] Eliminate cuda syncs (#1374) --- litgpt/lora.py | 24 ++++++++++++++++-------- litgpt/utils.py | 9 +++++++-- tests/test_lora.py | 6 +++--- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/litgpt/lora.py b/litgpt/lora.py index 51fd66713d..8fee63cbb6 100644 --- a/litgpt/lora.py +++ b/litgpt/lora.py @@ -264,18 +264,22 @@ def __init__( total_qkv = q_per_kv + 2 head_size = out_features // (self.n_query_groups * total_qkv) ind = range(out_features) - self.lora_ind = [] + lora_ind = [] if enable_q: q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] - self.lora_ind.extend(q_ind) + lora_ind.extend(q_ind) if enable_k: k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2] - self.lora_ind.extend(k_ind) + lora_ind.extend(k_ind) if enable_v: v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1] - self.lora_ind.extend(v_ind) + lora_ind.extend(v_ind) + self._lora_ind = torch.tensor(lora_ind) + self._lora_ind_cache = {self._lora_ind.device: self._lora_ind} self.reset_parameters() + + def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad weight updates with zeros. @@ -328,15 +332,19 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # ⚬ enable_lora: [True, False, True] # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected # embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but - # only for key updates (this is where self.lora_ind comes in handy) + # only for key updates (this is where lora_ind comes in handy) # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) result = result.view(-1, self.linear.out_features) # (4096, 384) - result = result.index_copy( - 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)) - ) # (4096, 256) + + # `lora_ind` is constant, so we want to avoid copying it (and incurring an expensive cudaStreamSynchronize) + # every time this method is called. So instead we simply cache a copy on each device that needs it. 
+ if (lora_ind := self._lora_ind_cache.get(result.device)) is None: + self._lora_ind_cache[result.device] = lora_ind = self._lora_ind.to(result.device) + + result = result.index_copy(1, lora_ind, x.reshape(-1, sum(self.qkv_shapes))) # (4096, 256) return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: diff --git a/litgpt/utils.py b/litgpt/utils.py index 21f7f34a98..6eb7efbff4 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -272,7 +272,8 @@ def chunked_cross_entropy( for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] non_masked_elems = (targets != ignore_index).sum() - return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) + # See [non_masked_elems div note] + return torch.cat(loss_chunks).sum() / non_masked_elems.maximum(torch.ones_like(non_masked_elems)) # no chunking at all logits = logits.reshape(-1, logits.size(-1)) @@ -288,7 +289,11 @@ def chunked_cross_entropy( for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] non_masked_elems = (targets != ignore_index).sum() - return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) + # [non_masked_elems div note]: + # max(1, non_masked_elems) would be more ergonomic to avoid a division by zero. However that + # results in a python int which is then passed back to torch division. By using the + # `x.maximum(torch.ones_like(x))` pattern we avoid a cudaStreamSynchronize. + return torch.cat(loss_chunks).sum() / non_masked_elems.maximum(torch.ones_like(non_masked_elems)) def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: diff --git a/tests/test_lora.py b/tests/test_lora.py index f8764c39bb..c09d07ee66 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -107,7 +107,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - assert attn.lora_ind == lora_ind + torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) bsz, ctx_len, in_dim = 2, 30, 8 @@ -128,7 +128,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - assert attn.lora_ind == lora_ind + torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) bsz, ctx_len, in_dim = 2, 30, 8 @@ -149,7 +149,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - assert attn.lora_ind == lora_ind + torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) bsz, ctx_len, in_dim = 2, 30, 8 From 96836008be96fa2fe5e6909a3fd7a112cc57716e Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 30 Apr 2024 13:42:44 +0200 Subject: [PATCH 10/21] Update Lightning version (#1375) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ba3bc7c9e9..f9a2dea996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ license = { file = "LICENSE" } dependencies = [ "torch>=2.2.0", - "lightning==2.3.0.dev20240328", + 
"lightning==2.3.0.dev20240428", "jsonargparse[signatures]>=4.27.6" ] From 0ce1ca4435656068995e3848132ed419bd39fd39 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 30 Apr 2024 16:56:39 +0200 Subject: [PATCH 11/21] Update LoRA test (#1376) --- tests/test_lora.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_lora.py b/tests/test_lora.py index c09d07ee66..d131411d9c 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -107,7 +107,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) bsz, ctx_len, in_dim = 2, 30, 8 @@ -128,7 +128,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) bsz, ctx_len, in_dim = 2, 30, 8 @@ -149,7 +149,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - torch.testing.assert_allclose(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) bsz, ctx_len, in_dim = 2, 30, 8 From 47445127e15cd1ab7157855cfd0ac1b77e9e1f70 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Tue, 30 Apr 2024 10:38:07 -0500 Subject: [PATCH 12/21] More informative download error messages (#1373) --- litgpt/scripts/download.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/litgpt/scripts/download.py b/litgpt/scripts/download.py index 13c0ca377e..f360c0b3b0 100644 --- a/litgpt/scripts/download.py +++ b/litgpt/scripts/download.py @@ -73,7 +73,7 @@ def download_from_hub( download.HF_HUB_ENABLE_HF_TRANSFER = True directory = checkpoint_dir / repo_id - with gated_repo_catcher(repo_id): + with gated_repo_catcher(repo_id, access_token): snapshot_download( repo_id, local_dir=directory, @@ -111,7 +111,7 @@ def find_weight_files(repo_id: str, access_token: Optional[str]) -> Tuple[List[s from huggingface_hub import repo_info from huggingface_hub.utils import filter_repo_objects - with gated_repo_catcher(repo_id): + with gated_repo_catcher(repo_id, access_token): info = repo_info(repo_id, token=access_token) filenames = [f.rfilename for f in info.siblings] bins = list(filter_repo_objects(items=filenames, allow_patterns=["*.bin*"])) @@ -120,17 +120,30 @@ def find_weight_files(repo_id: str, access_token: Optional[str]) -> Tuple[List[s @contextmanager -def gated_repo_catcher(repo_id: str): +def gated_repo_catcher(repo_id: str, access_token: Optional[str]): try: yield except OSError as e: - if "gated repo" in str(e): + err_msg = str(e) + if "Repository Not Found" in err_msg: raise ValueError( - f"https://huggingface.co/{repo_id} requires authentication, please set the `HF_TOKEN=your_token`" - " environment variable or pass --access_token=your_token. 
You can find your token by visiting" - " https://huggingface.co/settings/tokens" - ) - raise e + f"Repository at https://huggingface.co/api/models/{repo_id} not found." + " Please make sure you specified the correct `repo_id`." + ) from None + elif "gated repo" in err_msg: + if not access_token: + raise ValueError( + f"https://huggingface.co/{repo_id} requires authentication, please set the `HF_TOKEN=your_token`" + " environment variable or pass `--access_token=your_token`. You can find your token by visiting" + " https://huggingface.co/settings/tokens." + ) from None + else: + raise ValueError( + f"https://huggingface.co/{repo_id} requires authentication. The access token provided by `HF_TOKEN=your_token`" + " environment variable or `--access_token=your_token` may not have sufficient access rights. Please" + f" visit https://huggingface.co/{repo_id} for more information." + ) from None + raise e from None if __name__ == "__main__": From 2fa4d8207c060e558d63d1eedbd962e906d8d775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Natan=20Lot=C3=A9rio?= Date: Thu, 2 May 2024 20:44:15 +0100 Subject: [PATCH 13/21] Readme.md - Instruct how to get HF_TOKEN (#1380) Co-authored-by: rasbt --- README.md | 3 ++- tutorials/download_model_weights.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6bca484f1a..47a761aed4 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,8 @@ litgpt chat \ >> Prompt: What do Llamas eat? ``` -For more information, refer to the [download](tutorials/download_model_weights.md) and [inference](tutorials/inference.md) tutorials. +The download of certain models requires an additional access token. You can read more about this in the [download](tutorials/download_model_weights.md#specific-models-and-access-tokens) documentation. +For more information on the different inference options, refer to the [inference](tutorials/inference.md) tutorial.   diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 45c9c7d50c..1938ffe768 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -231,7 +231,7 @@ litgpt chat --checkpoint_dir checkpoints/$repo_id ```   -## Specific Models +## Specific models and access tokens Note that certain models require that you've been granted access to the weights on the Hugging Face Hub. From d39b26a5ffe1588a4e0872af8b0db02858ce9a47 Mon Sep 17 00:00:00 2001 From: Federico Belotti Date: Fri, 3 May 2024 16:37:15 +0200 Subject: [PATCH 14/21] Feature/top p sampling (#1360) Co-authored-by: Sebastian Raschka --- litgpt/chat/base.py | 36 +++++++++++++++-- litgpt/deploy/serve.py | 19 +++++++++ litgpt/generate/adapter.py | 17 ++++++++- litgpt/generate/adapter_v2.py | 17 ++++++++- litgpt/generate/base.py | 68 ++++++++++++++++++++++++++++++--- litgpt/generate/full.py | 17 ++++++++- litgpt/generate/sequentially.py | 17 ++++++++- litgpt/generate/tp.py | 15 ++++++++ tests/test_chat.py | 4 +- tests/test_generate.py | 24 ++++++++++-- tests/test_generate_adapter.py | 4 +- 11 files changed, 217 insertions(+), 21 deletions(-) diff --git a/litgpt/chat/base.py b/litgpt/chat/base.py index 7f2afc8f19..9fa27d2ba3 100644 --- a/litgpt/chat/base.py +++ b/litgpt/chat/base.py @@ -24,6 +24,7 @@ def generate( *, temperature: float = 1.0, top_k: Optional[int] = None, + top_p: float = 1.0, stop_tokens: Tuple[List[int], ...] = (), ) -> Iterator[torch.Tensor]: """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as possible. 
@@ -33,7 +34,21 @@ def generate( prompt: Tensor of shape (T) with indices of the prompt sequence. max_returned_tokens: The maximum number of tokens to return (given plus generated). temperature: Scales the predicted logits by 1 / temperature - top_k: If specified, only sample among the tokens with the k highest probabilities + top_k: If specified, only sample among the tokens with the k highest probabilities. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p stop_tokens: If specified, stop generating any more token once one of this list is generated. """ T = prompt.size(0) @@ -51,7 +66,7 @@ def generate( tokens = [] token = prompt for t in range(1, max_returned_tokens - T + 1): - token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k) + token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p) tokens.append(token) # check the stop condition if any((l := len(st)) <= len(tokens) and all(a == b for a, b in zip(tokens[-l:], st)) for st in stop_tokens): @@ -99,6 +114,7 @@ def decode(fabric: L.Fabric, tokenizer: Tokenizer, token_stream: Iterator[torch. def main( *, top_k: Optional[int] = 200, + top_p: float = 1.0, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-tuned-alpha-3b"), quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, @@ -109,6 +125,20 @@ def main( Args: top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. checkpoint_dir: The checkpoint directory to load. 
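[Editor's aside] The three-step order documented above (top-k cropping, then temperature scaling, then top-p filtering) can be traced on a toy logits vector. The snippet below is only an illustrative sketch with made-up values and variable names; it does not use the helpers added by this patch.

```python
import torch

logits = torch.tensor([4.0, 3.0, 2.0, 1.0, 0.5])  # toy next-token logits

# 1) top_k = 3: keep only the three largest logits
v, _ = torch.topk(logits, 3)
logits = torch.where(logits < v[-1], torch.full_like(logits, float("-inf")), logits)

# 2) temperature = 0.5: sharpen the distribution
logits = logits / 0.5

# 3) top_p = 0.9: drop the low-probability tail whose cumulative mass is <= 1 - top_p
sorted_logits, sorted_indices = torch.sort(logits, descending=False)
cumprobs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
remove = cumprobs <= (1 - 0.9)
remove[-1] = False  # always keep the most probable token
logits = logits.masked_fill(remove.scatter(0, sorted_indices, remove), float("-inf"))

# sample the next token id from what is left
probs = torch.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
print(next_token.item())
```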
@@ -175,7 +205,7 @@ def main( prompt = prompt_style.apply(prompt=prompt) encoded_prompt = tokenizer.encode(prompt, device=fabric.device) y = generate( - model, encoded_prompt, model.max_seq_length, temperature=temperature, top_k=top_k, stop_tokens=stop_tokens + model, encoded_prompt, model.max_seq_length, temperature=temperature, top_k=top_k, top_p=top_p, stop_tokens=stop_tokens ) fabric.print(">> Reply: ", end="") t0 = time.perf_counter() diff --git a/litgpt/deploy/serve.py b/litgpt/deploy/serve.py index 9cd594230d..42ff3bdd50 100644 --- a/litgpt/deploy/serve.py +++ b/litgpt/deploy/serve.py @@ -29,6 +29,7 @@ def __init__(self, precision: Optional[str] = None, temperature: float = 0.8, top_k: int = 50, + top_p: float = 1.0, max_new_tokens: int = 50) -> None: if not _LITSERVE_AVAILABLE: @@ -40,6 +41,7 @@ def __init__(self, self.temperature = temperature self.top_k = top_k self.max_new_tokens = max_new_tokens + self.top_p = top_p def setup(self, device: str) -> None: # Setup the model so it can be called in `predict`. @@ -90,6 +92,7 @@ def predict(self, inputs: torch.Tensor) -> Any: max_returned_tokens, temperature=self.temperature, top_k=self.top_k, + top_p=self.top_p, eos_id=self.tokenizer.eos_id ) @@ -108,6 +111,7 @@ def run_server( precision: Optional[str] = None, temperature: float = 0.8, top_k: int = 200, + top_p: float = 1.0, max_new_tokens: int = 50, devices: int = 1, accelerator: str = "auto", @@ -123,6 +127,20 @@ def run_server( Values below 1 decrease randomness. top_k: The size of the pool of potential next tokens. Values larger than 1 result in more novel generated text but can also lead to more incoherent texts. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p max_new_tokens: The number of generation steps to take. devices: How many devices/GPUs to use. accelerator: The type of accelerator to use. For example, "auto", "cuda", "cpu", or "mps". @@ -137,6 +155,7 @@ def run_server( precision=precision, temperature=temperature, top_k=top_k, + top_p=top_p, max_new_tokens=max_new_tokens, ), accelerator=accelerator, diff --git a/litgpt/generate/adapter.py b/litgpt/generate/adapter.py index 91ebd18397..10e882dbbf 100644 --- a/litgpt/generate/adapter.py +++ b/litgpt/generate/adapter.py @@ -24,6 +24,7 @@ def main( quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, precision: Optional[str] = None, ) -> None: @@ -42,6 +43,20 @@ def main( for more details, see https://github.com/Lightning-AI/litgpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. 
+ In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. precision: Indicates the Fabric precision setting to use. @@ -97,7 +112,7 @@ def main( L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 output = tokenizer.decode(y) diff --git a/litgpt/generate/adapter_v2.py b/litgpt/generate/adapter_v2.py index 0d25092135..4f6406080c 100644 --- a/litgpt/generate/adapter_v2.py +++ b/litgpt/generate/adapter_v2.py @@ -24,6 +24,7 @@ def main( quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, precision: Optional[str] = None, ) -> None: @@ -42,6 +43,20 @@ def main( for more details, see https://github.com/Lightning-AI/litgpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. precision: Indicates the Fabric precision setting to use. 
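[Editor's aside] The adapter scripts, like the other generation entry points patched below, forward `top_p` to `litgpt.generate.base.generate`. Called directly, that function can be exercised with a randomly initialized toy model, roughly as the new unit test at the end of this patch does; the configuration and sampling values below merely mirror that test and are illustrative, not a recommended way to run real checkpoints.

```python
import torch
from litgpt import GPT, Config
from litgpt.generate.base import generate

# tiny random model, mirroring the new test in tests/test_generate.py
config = Config(block_size=128, vocab_size=16, n_layer=1, n_head=4, n_embd=8)
model = GPT(config)
model.max_seq_length = 50
model.set_kv_cache(batch_size=1)

torch.manual_seed(123)
prompt = torch.randint(16, size=(5,))  # a fake prompt of 5 token ids

out = generate(model, prompt, max_returned_tokens=20, temperature=0.8, top_k=3, top_p=0.9)
print(out.tolist())
```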
@@ -97,7 +112,7 @@ def main( L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 output = tokenizer.decode(y) diff --git a/litgpt/generate/base.py b/litgpt/generate/base.py index 060604b43f..2b0f1b06de 100644 --- a/litgpt/generate/base.py +++ b/litgpt/generate/base.py @@ -24,7 +24,26 @@ def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor: return torch.multinomial(probs, num_samples=1) -def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor: +def sample_top_p(logits: torch.Tensor, top_p: float) -> torch.Tensor: + sorted_logits, sorted_indices = torch.sort(logits, descending=False) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + # Example: + # sorted_probs=[0.1, 0.15, 0.2, 0.25, 0.3] -> sorted_cumprobs=[0.1, 0.25, 0.45, 0.7, 1.0] + # sorted_indices_to_remove = [1, 1, 0, 0, 0] if top_p=0.7 + sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + # Keep at least 1 token always to prevent the case where no token is selected + # In this case the most probable one is always kept + sorted_indices_to_remove[-1:] = 0 + indices_to_remove = sorted_indices_to_remove.scatter(0, sorted_indices, sorted_indices_to_remove) + logits = logits.masked_fill(indices_to_remove, float("-inf")) + return logits + + +def sample( + logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None, top_p: float = 1.0 +) -> torch.Tensor: + if top_p < 0.0 or top_p > 1.0: + raise ValueError(f"top_p must be in [0, 1], got {top_p}") logits = logits[0, -1] # optionally crop the logits to only the top k options if top_k is not None: @@ -32,8 +51,13 @@ def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] # do not use `torch.where` as in nanogpt because it will repeat top-k collisions logits = torch.full_like(logits, float("-inf")).scatter_(-1, i, v) # optionally scale the logits and sample from a probability distribution - if temperature > 0.0: - probs = torch.nn.functional.softmax(logits / temperature, dim=-1) + if temperature > 0.0 or top_p > 0.0: + if temperature > 0.0: + logits = logits / temperature + # optionally crop the logits to smallest set of logits with a cumulative probability above top_p + if top_p < 1.0: + logits = sample_top_p(logits, top_p) + probs = torch.nn.functional.softmax(logits, dim=-1) return multinomial_num_samples_1(probs) return torch.argmax(logits, dim=-1, keepdim=True) @@ -52,6 +76,7 @@ def generate( *, temperature: float = 1.0, top_k: Optional[int] = None, + top_p: float = 1.0, eos_id: Optional[int] = None, ) -> torch.Tensor: """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested. @@ -64,6 +89,20 @@ def generate( max_returned_tokens: The maximum number of tokens to return (given plus generated). temperature: Scales the predicted logits by 1 / temperature. top_k: If specified, only sample among the tokens with the k highest probabilities. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. 
Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p eos_id: If specified, stop generating any more token once the token is triggered. """ T = prompt.size(0) @@ -78,11 +117,13 @@ def generate( tokens = [prompt] input_pos = torch.tensor([T], device=device) token = next_token( - model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k + model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p ).clone() tokens.append(token) for _ in range(2, max_returned_tokens - T + 1): - token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone() + token = next_token( + model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p + ).clone() tokens.append(token) if token == eos_id: break @@ -97,6 +138,7 @@ def main( num_samples: int = 1, max_new_tokens: int = 50, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, @@ -110,6 +152,20 @@ def main( num_samples: The number of text samples to generate. max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. checkpoint_dir: The checkpoint directory to load. 
@@ -175,7 +231,7 @@ def main( L.seed_everything(1234) for i in range(num_samples): t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 for block in model.transformer.h: block.attn.kv_cache.reset_parameters() diff --git a/litgpt/generate/full.py b/litgpt/generate/full.py index c570e8dd2e..56a4e7975d 100644 --- a/litgpt/generate/full.py +++ b/litgpt/generate/full.py @@ -23,6 +23,7 @@ def main( quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, precision: Optional[str] = None, ) -> None: @@ -41,6 +42,20 @@ def main( for more details, see https://github.com/Lightning-AI/litgpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. precision: Indicates the Fabric precision setting to use. @@ -93,7 +108,7 @@ def main( L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 output = tokenizer.decode(y) diff --git a/litgpt/generate/sequentially.py b/litgpt/generate/sequentially.py index 9f006ab47f..1d1908d088 100644 --- a/litgpt/generate/sequentially.py +++ b/litgpt/generate/sequentially.py @@ -117,6 +117,7 @@ def main( num_samples: int = 1, max_new_tokens: int = 50, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, @@ -130,6 +131,20 @@ def main( num_samples: The number of text samples to generate. max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. 
Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. checkpoint_dir: The checkpoint directory to load. @@ -206,7 +221,7 @@ def main( for i in range(num_samples): t0 = time.perf_counter() y = generate_base.generate( - model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, top_p=top_p, eos_id=tokenizer.eos_id ) t = time.perf_counter() - t0 for block in model.transformer.h: diff --git a/litgpt/generate/tp.py b/litgpt/generate/tp.py index 41492f75b2..39d6ac1065 100644 --- a/litgpt/generate/tp.py +++ b/litgpt/generate/tp.py @@ -95,6 +95,7 @@ def main( num_samples: int = 1, max_new_tokens: int = 50, top_k: Optional[int] = 50, + top_p: float = 1.0, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, @@ -108,6 +109,20 @@ def main( num_samples: The number of text samples to generate. max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. + top_p: If specified, it represents the cumulative probability threshold to consider in the sampling process. + In top-p sampling, the next token is sampled from the highest probability tokens + whose cumulative probability exceeds the threshold `top_p`. When specified, + it must be `0 <= top_p <= 1`. Here, `top_p=0` is equivalent + to sampling the most probable token, while `top_p=1` samples from the whole distribution. + It can be used in conjunction with `top_k` and `temperature` with the following order + of application: + + 1. `top_k` sampling + 2. `temperature` scaling + 3. `top_p` sampling + + For more details, see https://arxiv.org/abs/1904.09751 + or https://huyenchip.com/2024/01/16/sampling.html#top_p temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. checkpoint_dir: The checkpoint directory to load. 
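[Editor's aside] Because these `main()` signatures are exposed through the CLI, the new parameter rides along with the existing sampling flags. A hypothetical invocation is sketched below; the checkpoint path and values are examples only, and it assumes the CLI surfaces `--top_p` the same way it already surfaces `--top_k` and `--temperature`.

```bash
# download a small model first, e.g.:
#   litgpt download --repo_id microsoft/phi-2
litgpt generate base \
  --checkpoint_dir checkpoints/microsoft/phi-2 \
  --max_new_tokens 100 \
  --temperature 0.8 \
  --top_k 50 \
  --top_p 0.9
```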
diff --git a/tests/test_chat.py b/tests/test_chat.py index 3f456e7421..13b432897d 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -110,13 +110,13 @@ def test_main(mocked_input, stop_iteration, fake_checkpoint_dir, monkeypatch, te out, err = StringIO(), StringIO() with redirect_stdout(out), redirect_stderr(err): - chat.main(temperature=2.0, top_k=2, checkpoint_dir=fake_checkpoint_dir) + chat.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) # decoding is done per each generated item assert len(tokenizer_mock.return_value.decode.mock_calls) == generate_mock.return_value.numel() assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) assert generate_mock.mock_calls == [ - call(ANY, tensor_like, 128, temperature=2.0, top_k=2, stop_tokens=([tokenizer_mock.return_value.eos_id],)) + call(ANY, tensor_like, 128, temperature=2.0, top_k=2, top_p=0.9, stop_tokens=([tokenizer_mock.return_value.eos_id],)) ] # only the generated result is printed to stdout assert re.match("Now chatting with Llama 3.*>> .*Reply: foo bar baz", out.getvalue(), re.DOTALL) diff --git a/tests/test_generate.py b/tests/test_generate.py index 4c317639c4..5f950ddcfa 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -68,13 +68,13 @@ def test_main(fake_checkpoint_dir, monkeypatch, tensor_like): num_samples = 2 out, err = StringIO(), StringIO() with redirect_stdout(out), redirect_stderr(err): - generate.main(temperature=2.0, top_k=2, num_samples=num_samples, checkpoint_dir=fake_checkpoint_dir) + generate.main(temperature=2.0, top_k=2, top_p=0.9, num_samples=num_samples, checkpoint_dir=fake_checkpoint_dir) assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) assert ( generate_mock.mock_calls - == [call(ANY, tensor_like, 53, temperature=2.0, top_k=2, eos_id=tokenizer_mock.return_value.eos_id)] + == [call(ANY, tensor_like, 53, temperature=2.0, top_k=2, top_p=0.9, eos_id=tokenizer_mock.return_value.eos_id)] * num_samples ) # only the generated result is printed to stdout @@ -102,10 +102,26 @@ def test_sample(temperature): [ [[24, 4, 98, 77, 47], [65, 70, 32, 67, 24], [92, 32, 88, 36, 62]], [[85, 79, 57, 68, 50], [89, 46, 72, 45, 32], [68, 96, 68, 24, 36]], - ] + ], + dtype=torch.float32, ) - token = sample(logits, temperature=temperature) + token = sample(logits, temperature=temperature, top_p=0.8) assert token.shape == (1,) # sample is batch size 1 only for now - this should be [0, 1] once batched generation is supported assert token.tolist() == [0] + + +def test_generate_different_results_with_different_top_p(): + config = Config(block_size=128, vocab_size=16, n_layer=1, n_head=4, n_embd=8) + model = GPT(config) + model.max_seq_length = 50 + model.set_kv_cache(batch_size=1) + + torch.manual_seed(123) + input_idx = torch.randint(10, size=(1,)) + + output1 = generate.generate(model, input_idx, 20, top_p=1.0) + output2 = generate.generate(model, input_idx, 20, top_p=0.1) + + assert not torch.equal(output1, output2) diff --git a/tests/test_generate_adapter.py b/tests/test_generate_adapter.py index 7b089e833e..e977b0a93b 100644 --- a/tests/test_generate_adapter.py +++ b/tests/test_generate_adapter.py @@ -36,11 +36,11 @@ def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): num_samples = 1 out, err = StringIO(), StringIO() with redirect_stdout(out), redirect_stderr(err): - generate.main(temperature=2.0, 
top_k=2, checkpoint_dir=fake_checkpoint_dir) + generate.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) - assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, eos_id=ANY)] * num_samples + assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, top_p=0.9, eos_id=ANY)] * num_samples # only the generated result is printed to stdout assert out.getvalue() == "foo bar baz\n" * num_samples From e441c658439df570b0d9e1deda63dcfdaa1146bc Mon Sep 17 00:00:00 2001 From: Dev Khant Date: Fri, 3 May 2024 20:38:50 +0530 Subject: [PATCH 15/21] Add H2O Danube2 Checkpoint (#1282) Co-authored-by: Luca Antiga Co-authored-by: Sebastian Raschka Co-authored-by: Andrei-Aksionov Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- README.md | 1 + litgpt/config.py | 29 ++++++++++++++- litgpt/prompts.py | 8 ++++ tests/test_model.py | 58 +++++++++++++++++++++++++++++ tutorials/download_model_weights.md | 24 +++++++----- 5 files changed, 110 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 47a761aed4..e4377d5281 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ LitGPT has 🤯 **custom, from-scratch implementations** of [20+ LLMs](tutorials |----|----|----|----| | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) | +| Danube2 | 1.8B | H2O.ai | [H2O.ai](https://h2o.ai/platform/danube-1-8b/) | | Dolly | 3B, 7B, 12B | Databricks | [Conover et al. 
2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | | Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) | | FreeWilly2 (Stable Beluga 2) | 70B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | diff --git a/litgpt/config.py b/litgpt/config.py index e03fa8ae34..0f35d43eec 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -836,7 +836,7 @@ def norm_class(self) -> Type: copy["name"] = c["name"].format(kind) copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) configs.append(copy) - + ############### # Meta LLaMA 3 @@ -964,6 +964,33 @@ def norm_class(self) -> Type: ] configs.extend(codegemma) +################ +# H2Oai Danube2 +################ +danube2 = [ + # https://huggingface.co/h2oai/h2o-danube2-1.8b-chat/blob/main/config.json + dict( + name="Danube2-1.8b-chat", + hf_config=dict(org="h2oai", name="h2o-danube2-1.8b-chat"), + vocab_size=32000, + n_layer=24, + n_head=32, + n_embd=2560, + block_size=4096, # should be 8192 but sliding_window mechanism is not implemented + intermediate_size=6912, + padding_multiple=64, + norm_eps=1e-05, + rope_base=10000, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + ) +] +configs.extend(danube2) + ########################## # Stability AI FreeWilly2 diff --git a/litgpt/prompts.py b/litgpt/prompts.py index 04a0551cd1..a0e515c3f8 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -288,6 +288,11 @@ def apply(self, prompt: str, **kwargs: str) -> str: return f"user\n{prompt}\nmodel\n" +class H2Oai(PromptStyle): + def apply(self, prompt: str, **kwargs: str) -> str: + return f"<|prompt|>{prompt}<|answer|>" + + # Maps prompt style names to PromptStyle classes prompt_styles: Dict[str, Type[PromptStyle]] = { # Dataset-specific prompt styles @@ -312,6 +317,7 @@ def apply(self, prompt: str, **kwargs: str) -> str: "phi-2": Phi2, "tinyllama": TinyLlama, "gemma": Gemma, + "h2oai": H2Oai, } @@ -352,6 +358,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return TinyLlama() if re.search(r"(Code)?Gemma.*-it", model_name): return Gemma() + if re.search("Danube2.*-chat", model_name): + return H2Oai() return Default() diff --git a/tests/test_model.py b/tests/test_model.py index 7743c4f143..49584aeb87 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -417,6 +417,64 @@ def test_against_hf_mixtral(): torch.testing.assert_close(ours_y, theirs_y) +@torch.inference_mode() +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_hf_h2o_danube(device, dtype): + torch.set_default_dtype(dtype) + + ours_config = Config.from_name( + "Danube2-1.8b-chat", + padded_vocab_size=10000, + n_layer=2, + n_embd=16, + n_head=8, + n_query_groups=2, + intermediate_size=43, + ) + T = 5 + theirs_config = MistralConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = MistralForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + @torch.inference_mode() @pytest.mark.parametrize( ("device", "dtype"), diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 1938ffe768..2775e1e578 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -2,11 +2,11 @@ LitGPT supports a variety of LLM architectures with publicly available weights. You can download model weights and access a list of supported models using the LitGPT `download.py` script. - | Model | Model size | Reference | |----------------------------------------------|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------| | CodeGemma by Google | 7B | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | | Code Llama by Meta AI | 7B, 13B, 34B, 70B | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) | +| Danube2 by H2O.ai | 1.8B | [H2O.ai](https://h2o.ai/platform/danube-1-8b/) | Dolly by Databricks | 3B, 7B, 12B | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | | Falcon by TII UAE | 7B, 40B, 180B | [TII 2023](https://falconllm.tii.ae) | | FreeWilly2 (Stable Beluga 2) by Stability AI | 70B | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | @@ -28,11 +28,9 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | TinyLlama by Zhang et al. | 1.1B | [Zhang et al. 2023](https://github.com/jzhang38/TinyLlama) | | Vicuna by LMSYS | 7B, 13B, 33B | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/) | - -   -## General Instructions +## General Instructions ### 1. 
List Available Models @@ -91,6 +89,7 @@ google/gemma-2b google/gemma-2b-it google/gemma-7b google/gemma-7b-it +h2oai/h2o-danube2-1.8b-chat lmsys/longchat-13b-16k lmsys/longchat-7b-16k lmsys/vicuna-13b-v1.3 @@ -165,14 +164,15 @@ unsloth/Mistral-7B-v0.2 > [!NOTE] > If you want to adopt a model variant that is not listed in the table above but has a similar architecture as one of the supported models, you can use this model by by using the `--model_name` argument as shown below: +> > ```bash > litgpt download \ > --repo_id NousResearch/Hermes-2-Pro-Mistral-7B \ > --model_name Mistral-7B-v0.1 > ``` -   + ### 2. Download Model Weights To download the weights for a specific model, use the `--repo_id` argument. Replace `` with the model's repository ID. For example: @@ -180,11 +180,12 @@ To download the weights for a specific model, use the `--repo_id` argument. Repl ```bash litgpt download --repo_id ``` + This command downloads the model checkpoint into the `checkpoints/` directory.   -### 3. Additional Help +### 3. Additional Help For more options, add the `--help` flag when running the script: @@ -193,6 +194,7 @@ litgpt download --help ```   + ### 4. Run the Model After conversion, run the model with the `--checkpoint_dir` flag, adjusting `repo_id` accordingly: @@ -202,6 +204,7 @@ litgpt chat --checkpoint_dir checkpoints/ ```   + ## Tinyllama Example This section shows a typical end-to-end example for downloading and using TinyLlama: @@ -235,7 +238,7 @@ litgpt chat --checkpoint_dir checkpoints/$repo_id Note that certain models require that you've been granted access to the weights on the Hugging Face Hub. -For example, to get access to the Gemma 2B model, you can do so by following the steps at https://huggingface.co/google/gemma-2b. After access is granted, you can find your HF hub token in https://huggingface.co/settings/tokens. +For example, to get access to the Gemma 2B model, you can do so by following the steps at . After access is granted, you can find your HF hub token in . Once you've been granted access and obtained the access token you need to pass the additional `--access_token`: @@ -246,7 +249,8 @@ litgpt download \ ```   -## Finetunes and other model variants + +## Finetunes and Other Model Variants Sometimes you want to download the weights of a finetune of one of the models listed above. To do this, you need to manually specify the `model_name` associated to the config to use. For example: @@ -257,11 +261,11 @@ litgpt download \ ```   + ## Tips for GPU Memory Limitations The `download.py` script will automatically convert the downloaded model checkpoint into a LitGPT-compatible format. In case this conversion fails due to GPU memory constraints, you can try to reduce the memory requirements by passing the `--dtype bf16-true` flag to convert all parameters into this smaller precision (however, note that most model weights are already in a bfloat16 format, so it may not have any effect): - ```bash litgpt download \ --repo_id @@ -271,6 +275,7 @@ litgpt download \ (If your GPU does not support the bfloat16 format, you can also try a regular 16-bit float format via `--dtype 16-true`.)   + ## Converting Checkpoints Manually For development purposes, for example, when adding or experimenting with new model configurations, it may be beneficial to split the weight download and model conversion into two separate steps. 
@@ -291,6 +296,7 @@ litgpt convert to_litgpt \ ```   + ## Downloading Tokenizers Only In some cases we don't need the model weight, for example, when we are pretraining a model from scratch instead of finetuning it. For cases like this, you can use the `--tokenizer_only` flag to only download a model's tokenizer, which can then be used in the pretraining scripts: From f3343784bbd192490e2a70aa5ef75c52608b1d35 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 3 May 2024 10:23:11 -0500 Subject: [PATCH 16/21] Test readme commands (#1311) --- pyproject.toml | 1 + tests/test_readme.py | 170 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 tests/test_readme.py diff --git a/pyproject.toml b/pyproject.toml index f9a2dea996..ab7b9b26f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ test = [ "pytest>=8.1.1", "pytest-rerunfailures>=14.0", "pytest-timeout>=2.3.1", + "pytest-dependency>=0.6.0", "transformers>=4.38.0", # numerical comparisons "einops>=0.7.0", "protobuf>=4.23.4", diff --git a/tests/test_readme.py b/tests/test_readme.py new file mode 100644 index 0000000000..dd05d5ec17 --- /dev/null +++ b/tests/test_readme.py @@ -0,0 +1,170 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +from pathlib import Path +import os +import pytest +import requests +import subprocess +import sys +import threading +import time + + +REPO_ID = Path("EleutherAI/pythia-14m") +CUSTOM_TEXTS_DIR = Path("custom_texts") + + +def run_command(command): + try: + result = subprocess.run(command, capture_output=True, text=True, check=True) + return result.stdout + except subprocess.CalledProcessError as e: + error_message = ( + f"Command '{' '.join(command)}' failed with exit status {e.returncode}\n" + f"Output:\n{e.stdout}\n" + f"Error:\n{e.stderr}" + ) + # You can either print the message, log it, or raise an exception with it + print(error_message) + raise RuntimeError(error_message) from None + + +@pytest.mark.skipif( + sys.platform.startswith("win") or + sys.platform == "darwin" or + 'AGENT_NAME' in os.environ, + reason="Does not run on Windows, macOS, or Azure Pipelines" +) +@pytest.mark.dependency() +def test_download_model(): + repo_id = str(REPO_ID).replace("\\", "/") # fix for Windows CI + command = ["litgpt", "download", "--repo_id", str(repo_id)] + output = run_command(command) + + s = Path("checkpoints") / repo_id + assert f"Saving converted checkpoint to {str(s)}" in output + assert ("checkpoints" / REPO_ID).exists() + + +@pytest.mark.dependency() +def test_download_books(): + CUSTOM_TEXTS_DIR.mkdir(parents=True, exist_ok=True) + + books = [ + ("https://www.gutenberg.org/cache/epub/24440/pg24440.txt", "book1.txt"), + ("https://www.gutenberg.org/cache/epub/26393/pg26393.txt", "book2.txt") + ] + for url, filename in books: + subprocess.run(["curl", url, "--output", str(CUSTOM_TEXTS_DIR / filename)], check=True) + # Verify each book is downloaded + assert (CUSTOM_TEXTS_DIR / filename).exists(), f"{filename} not downloaded" + + +@pytest.mark.dependency(depends=["test_download_model"]) +def test_chat_with_model(): + command = ["litgpt", "generate", "base", "--checkpoint_dir", f"checkpoints"/REPO_ID] + prompt = "What do Llamas eat?" + result = subprocess.run(command, input=prompt, text=True, capture_output=True, check=True) + assert "What food do llamas eat?" 
in result.stdout + + +@pytest.mark.dependency(depends=["test_download_model"]) +@pytest.mark.timeout(300) +def test_finetune_model(): + + OUT_DIR = Path("out") / "lora" + DATASET_PATH = Path("custom_finetuning_dataset.json") + CHECKPOINT_DIR = "checkpoints" / REPO_ID + + download_command = ["curl", "-L", "https://huggingface.co/datasets/medalpaca/medical_meadow_health_advice/raw/main/medical_meadow_health_advice.json", "-o", str(DATASET_PATH)] + subprocess.run(download_command, check=True) + + assert DATASET_PATH.exists(), "Dataset file not downloaded" + + finetune_command = [ + "litgpt", "finetune", "lora", + "--checkpoint_dir", str(CHECKPOINT_DIR), + "--lora_r", "1", + "--data", "JSON", + "--data.json_path", str(DATASET_PATH), + "--data.val_split_fraction", "0.00001", # Keep small because new final validation is expensive + "--train.max_steps", "1", + "--out_dir", str(OUT_DIR) + ] + run_command(finetune_command) + + assert (OUT_DIR/"final").exists(), "Finetuning output directory was not created" + assert (OUT_DIR/"final"/"lit_model.pth").exists(), "Model file was not created" + + +@pytest.mark.dependency(depends=["test_download_model", "test_download_books"]) +def test_pretrain_model(): + OUT_DIR = Path("out") / "custom_pretrained" + pretrain_command = [ + "litgpt", "pretrain", + "--model_name", "pythia-14m", + "--tokenizer_dir", str("checkpoints" / REPO_ID), + "--data", "TextFiles", + "--data.train_data_path", str(CUSTOM_TEXTS_DIR), + "--train.max_tokens", "100", # to accelerate things for CI + "--eval.max_iters", "1", # to accelerate things for CI + "--out_dir", str(OUT_DIR) + ] + run_command(pretrain_command) + + assert (OUT_DIR / "final").exists(), "Pretraining output directory was not created" + assert (OUT_DIR / "final" / "lit_model.pth").exists(), "Model file was not created" + + +@pytest.mark.dependency(depends=["test_download_model", "test_download_books"]) +def test_continue_pretrain_model(): + OUT_DIR = Path("out") / "custom_continue_pretrained" + pretrain_command = [ + "litgpt", "pretrain", + "--model_name", "pythia-14m", + "--initial_checkpoint", str("checkpoints" / REPO_ID), + "--tokenizer_dir", str("checkpoints" / REPO_ID), + "--data", "TextFiles", + "--data.train_data_path", str(CUSTOM_TEXTS_DIR), + "--train.max_tokens", "100", # to accelerate things for CI + "--eval.max_iters", "1", # to accelerate things for CI + "--out_dir", str(OUT_DIR) + ] + run_command(pretrain_command) + + assert (OUT_DIR / "final").exists(), "Continued pretraining output directory was not created" + assert (OUT_DIR / "final" / "lit_model.pth").exists(), "Model file was not created" + + +@pytest.mark.dependency(depends=["test_download_model"]) +def test_serve(): + CHECKPOINT_DIR = str("checkpoints" / REPO_ID) + run_command = [ + "litgpt", "serve", + "--checkpoint_dir", str(CHECKPOINT_DIR) + ] + + process = None + + def run_server(): + nonlocal process + try: + process = subprocess.Popen(run_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + stdout, stderr = process.communicate(timeout=60) + except subprocess.TimeoutExpired: + print('Server start-up timeout expired') + + server_thread = threading.Thread(target=run_server) + server_thread.start() + + # Allow time to initialize and start serving + time.sleep(30) + + try: + response = requests.get("http://127.0.0.1:8000") + print(response.status_code) + assert response.status_code == 200, "Server did not respond as expected." 
+ finally: + if process: + process.kill() + server_thread.join() From b1a43cd264f0f7eec8d429df1413d0a027f94f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 6 May 2024 11:51:46 +0200 Subject: [PATCH 17/21] Fix top-p test (#1389) --- tests/test_generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_generate.py b/tests/test_generate.py index 5f950ddcfa..7cd0dca9db 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -121,7 +121,9 @@ def test_generate_different_results_with_different_top_p(): torch.manual_seed(123) input_idx = torch.randint(10, size=(1,)) + torch.manual_seed(123) output1 = generate.generate(model, input_idx, 20, top_p=1.0) + torch.manual_seed(123) output2 = generate.generate(model, input_idx, 20, top_p=0.1) assert not torch.equal(output1, output2) From 6fd737d3da240a67f4acb7a3ce733fa2e67538a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 6 May 2024 17:46:46 +0200 Subject: [PATCH 18/21] Unskip Thunder FSDP test (#1391) --- .github/azure-gpu-test.yml | 4 ++-- tests/__init__.py | 7 +++++++ tests/conftest.py | 13 ++++++++----- tests/test_adapter.py | 2 +- tests/test_adapter_v2.py | 2 +- tests/test_ci.py | 2 +- tests/test_convert_lit_checkpoint.py | 4 +--- tests/test_generate_sequentially.py | 2 +- tests/test_generate_tp.py | 4 ++-- tests/test_lora.py | 4 ++-- tests/test_model.py | 4 +--- tests/test_pretrain.py | 4 +--- tests/test_prompts.py | 2 +- tests/test_thunder_ddp.py | 2 +- tests/test_thunder_fsdp.py | 4 +--- tests/test_thunder_pretrain.py | 2 +- tests/test_unsloth_executor.py | 2 +- tests/test_utils.py | 2 +- 18 files changed, 34 insertions(+), 32 deletions(-) create mode 100644 tests/__init__.py diff --git a/.github/azure-gpu-test.yml b/.github/azure-gpu-test.yml index b9b5b32c35..b5a5c9889e 100644 --- a/.github/azure-gpu-test.yml +++ b/.github/azure-gpu-test.yml @@ -13,7 +13,7 @@ pr: jobs: - job: testing - timeoutInMinutes: "20" + timeoutInMinutes: "30" cancelTimeoutInMinutes: "2" pool: "lit-rtx-3090" variables: @@ -67,4 +67,4 @@ jobs: env: PL_RUN_CUDA_TESTS: "1" displayName: "Standalone tests" - timeoutInMinutes: "5" + timeoutInMinutes: "10" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..2f22d66b14 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,7 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import warnings + +import pytest + +warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") diff --git a/tests/conftest.py b/tests/conftest.py index fdfe2295eb..fa22e514c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,6 @@ import os import shutil -import warnings from pathlib import Path from typing import List, Optional @@ -50,6 +49,14 @@ def restore_default_dtype(): torch.set_default_dtype(torch.float32) +@pytest.fixture(autouse=True) +def destroy_process_group(): + import torch.distributed + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + class MockTokenizer: """A dummy tokenizer that encodes each character as its ASCII code.""" @@ -149,7 +156,3 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C bold=True, purple=True, # oh yeah, branded pytest messages ) - - -# Ignore cleanup warnings from pytest (rarely happens due to a race condition when executing pytest in parallel) -warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 2028a78b83..9e724ab0e8 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -9,7 +9,7 @@ import pytest import torch import yaml -from conftest import RunIf +from tests.conftest import RunIf from lightning import Fabric from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision from lightning.fabric.wrappers import _FabricOptimizer diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py index 33f00a3166..5f63697e9e 100644 --- a/tests/test_adapter_v2.py +++ b/tests/test_adapter_v2.py @@ -8,7 +8,7 @@ import pytest import torch import yaml -from conftest import RunIf +from tests.conftest import RunIf from lightning import Fabric from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision from lightning.fabric.wrappers import _FabricOptimizer diff --git a/tests/test_ci.py b/tests/test_ci.py index d553b53e16..e1db31aeaf 100644 --- a/tests/test_ci.py +++ b/tests/test_ci.py @@ -1,6 +1,6 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
-from conftest import RunIf +from tests.conftest import RunIf from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py index ca4ee9881e..e5b1b889c0 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/test_convert_lit_checkpoint.py @@ -2,14 +2,11 @@ import os from dataclasses import asdict -from pathlib import Path from unittest.mock import ANY -from urllib.request import urlretrieve import pytest import torch import yaml -from conftest import RunIf from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.falcon import FalconConfig, FalconForCausalLM from transformers.models.gemma import GemmaConfig, GemmaForCausalLM @@ -27,6 +24,7 @@ copy_weights_phi, qkv_split, ) +from tests.conftest import RunIf def test_convert_lit_checkpoint(tmp_path): diff --git a/tests/test_generate_sequentially.py b/tests/test_generate_sequentially.py index 4bc3665f97..b0bed4797e 100644 --- a/tests/test_generate_sequentially.py +++ b/tests/test_generate_sequentially.py @@ -11,7 +11,7 @@ import pytest import torch import yaml -from conftest import RunIf +from tests.conftest import RunIf from lightning import Fabric from litgpt import Config diff --git a/tests/test_generate_tp.py b/tests/test_generate_tp.py index eb0505219c..039dd0ea4b 100644 --- a/tests/test_generate_tp.py +++ b/tests/test_generate_tp.py @@ -7,12 +7,12 @@ import pytest import torch import yaml -from conftest import RunIf -from test_generate_sequentially import find_forward_hooks from litgpt import GPT, Config from litgpt.generate.tp import tensor_parallel, tensor_parallel_linear from litgpt.scripts.download import download_from_hub +from tests.conftest import RunIf +from tests.test_generate_sequentially import find_forward_hooks def test_tensor_parallel_linear(): diff --git a/tests/test_lora.py b/tests/test_lora.py index d131411d9c..79dc5896a8 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -9,7 +9,6 @@ import pytest import torch import yaml -from conftest import RunIf from lightning import Fabric from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision from lightning.fabric.wrappers import _FabricOptimizer @@ -22,11 +21,12 @@ import litgpt.finetune.lora as module from litgpt.args import EvalArgs, TrainArgs from litgpt.data import Alpaca -from litgpt.lora import GPT as LoRAGPT from litgpt.lora import CausalSelfAttention as LoRACausalSelfAttention from litgpt.lora import Config, LoRALinear, LoRAQKVLinear, lora_filter, mark_only_lora_as_trainable, merge_lora_weights +from litgpt.lora import GPT as LoRAGPT from litgpt.model import GPT as BaseGPT from litgpt.scripts.convert_hf_checkpoint import copy_weights_hf_llama +from tests.conftest import RunIf def test_lora_layer_replacement(): diff --git a/tests/test_model.py b/tests/test_model.py index 49584aeb87..b8b6366fb7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -2,12 +2,9 @@ from copy import deepcopy from functools import partial -from pathlib import Path -from urllib.request import urlretrieve import pytest import torch -from conftest import RunIf from lightning import Fabric from lightning.fabric.utilities.imports import _IS_WINDOWS from lightning.fabric.utilities.init import _materialize_meta_tensors @@ -37,6 +34,7 @@ copy_weights_hf_llama, copy_weights_phi, ) +from tests.conftest import RunIf @torch.inference_mode() diff --git a/tests/test_pretrain.py 
b/tests/test_pretrain.py index d252524e87..9a67b85917 100644 --- a/tests/test_pretrain.py +++ b/tests/test_pretrain.py @@ -3,21 +3,19 @@ import os from contextlib import redirect_stdout from io import StringIO -from pathlib import Path from unittest import mock from unittest.mock import ANY, Mock import pytest import torch -from conftest import RunIf from lightning.fabric.strategies import FSDPStrategy, SingleDeviceStrategy from torch.utils.data import DataLoader -from test_utils import test_init_out_dir from litgpt import pretrain from litgpt.args import EvalArgs, TrainArgs from litgpt.config import Config from litgpt.pretrain import initialize_weights +from tests.conftest import RunIf @RunIf(min_cuda_gpus=2, standalone=True) diff --git a/tests/test_prompts.py b/tests/test_prompts.py index 20f2c84e0c..206db11c28 100644 --- a/tests/test_prompts.py +++ b/tests/test_prompts.py @@ -112,6 +112,6 @@ def test_save_load_prompt_style(tmp_path): save_prompt_style(CustomPromptStyle(), checkpoint_dir) with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file: contents = yaml.safe_load(file) - assert contents == {"class_path": "test_prompts.CustomPromptStyle"} + assert contents == {"class_path": "tests.test_prompts.CustomPromptStyle"} loaded = load_prompt_style(checkpoint_dir) assert isinstance(loaded, CustomPromptStyle) diff --git a/tests/test_thunder_ddp.py b/tests/test_thunder_ddp.py index 566e883ac3..2dbc208889 100644 --- a/tests/test_thunder_ddp.py +++ b/tests/test_thunder_ddp.py @@ -3,7 +3,7 @@ import pytest import torch -from conftest import RunIf +from tests.conftest import RunIf from lightning import Fabric # support running without installing as a package diff --git a/tests/test_thunder_fsdp.py b/tests/test_thunder_fsdp.py index 321cdac7a6..2c97ad626d 100644 --- a/tests/test_thunder_fsdp.py +++ b/tests/test_thunder_fsdp.py @@ -6,7 +6,7 @@ import pytest import torch -from conftest import RunIf +from tests.conftest import RunIf from lightning.fabric import Fabric from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 @@ -263,8 +263,6 @@ def set_up_planner(self, state_dict, metadata, is_coordinator): @RunIf(min_cuda_gpus=2, thunder=True, standalone=True) def test_save_load_sharded_checkpoint(tmp_path): - pytest.skip("Temporarily disabled, often exceeds 5 min timeout") - strategy = ThunderFSDPStrategy(state_dict_type="sharded", broadcast_from=0) fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) fabric.launch() diff --git a/tests/test_thunder_pretrain.py b/tests/test_thunder_pretrain.py index 358c0d5c59..30f9d71afb 100644 --- a/tests/test_thunder_pretrain.py +++ b/tests/test_thunder_pretrain.py @@ -6,7 +6,7 @@ from unittest.mock import Mock import torch -from conftest import RunIf +from tests.conftest import RunIf from torch.utils.data import DataLoader from litgpt import Config diff --git a/tests/test_unsloth_executor.py b/tests/test_unsloth_executor.py index 797d1f6f53..15b1c7c673 100644 --- a/tests/test_unsloth_executor.py +++ b/tests/test_unsloth_executor.py @@ -1,10 +1,10 @@ import pytest import torch -from conftest import RunIf from litgpt import GPT, Config from litgpt.model import apply_rope, build_rope_cache from litgpt.utils import chunked_cross_entropy +from tests.conftest import RunIf @RunIf(min_cuda_gpus=1, thunder=True) diff --git a/tests/test_utils.py b/tests/test_utils.py index cbb5230621..9770bf98e7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,7 @@ import torch import torch.nn.functional as F import 
yaml -from conftest import RunIf +from tests.conftest import RunIf from lightning import Fabric from lightning.fabric.loggers import CSVLogger, TensorBoardLogger from lightning.fabric.plugins import BitsandbytesPrecision From f84b610948d34b985b0f9693dea8c743429238e2 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> Date: Mon, 6 May 2024 19:11:41 +0300 Subject: [PATCH 19/21] LoRA: `zero_pad` speed improvements (#770) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- litgpt/lora.py | 63 ++++++++++++++++++++-------------------------- tests/test_lora.py | 31 ++++++++++++++++++++--- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/litgpt/lora.py b/litgpt/lora.py index 8fee63cbb6..7c4ae423e0 100644 --- a/litgpt/lora.py +++ b/litgpt/lora.py @@ -215,6 +215,7 @@ def __init__( """ super(LoRALinear, self).__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout) self.linear = torch.nn.Linear(in_features, out_features, **kwargs) + self.head_size = head_size self.n_head = n_head self.n_query_groups = n_query_groups if isinstance(enable_lora, bool): @@ -258,30 +259,34 @@ def __init__( # https://github.com/cloneofsimo/lora self.scaling = self.lora_alpha / self.r - # Compute the indices - # Indices are needed to properly pad weight updates with zeros in `zero_pad` method. - q_per_kv = self.n_head // self.n_query_groups - total_qkv = q_per_kv + 2 - head_size = out_features // (self.n_query_groups * total_qkv) - ind = range(out_features) + self.reset_parameters() + + @property + def lora_ind(self) -> torch.Tensor: + """Lazy creation of a buffer with LoRA indices to overcome the limitation when FSDP with meta device is used.""" + # Indices are needed to properly pad weight updates with zeros. + if not hasattr(self, "_lora_ind"): + enable_q, enable_k, enable_v = self.enable_lora + qkv_group_size = self.n_head // self.n_query_groups + 2 + candidate_indices = range(self.linear.out_features) lora_ind = [] if enable_q: - q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] + q_ind = [x for x in candidate_indices if (x // self.head_size) % qkv_group_size < qkv_group_size - 2] lora_ind.extend(q_ind) if enable_k: - k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2] + k_ind = [x for x in candidate_indices if (x // self.head_size) % qkv_group_size == qkv_group_size - 2] lora_ind.extend(k_ind) if enable_v: - v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1] + v_ind = [x for x in candidate_indices if (x // self.head_size) % qkv_group_size == qkv_group_size - 1] lora_ind.extend(v_ind) - self._lora_ind = torch.tensor(lora_ind) - self._lora_ind_cache = {self._lora_ind.device: self._lora_ind} - self.reset_parameters() - + self.register_buffer( + "_lora_ind", torch.tensor(lora_ind, device=self.linear.weight.device), persistent=False + ) + return self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: - """Properly pad weight updates with zeros. + """Properly pad the last dimension of weight updates with zeros. 
If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys, then the weights update should be: @@ -332,20 +337,9 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # ⚬ enable_lora: [True, False, True] # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected # embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but - # only for key updates (this is where lora_ind comes in handy) - # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors - # for example when we want to merge/unmerge LoRA weights and pretrained weights - x = x.transpose(0, 1) - result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) - result = result.view(-1, self.linear.out_features) # (4096, 384) - - # `lora_ind` is constant, so we want to avoid copying it (and incurring an expensive cudaStreamSynchronize) - # every time this method is called. So instead we simply cache a copy on each device that needs it. - if (lora_ind := self._lora_ind_cache.get(result.device)) is None: - self._lora_ind_cache[result.device] = lora_ind = self._lora_ind.to(result.device) - - result = result.index_copy(1, lora_ind, x.reshape(-1, sum(self.qkv_shapes))) # (4096, 256) - return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) + # only for key updates (this is where self.lora_ind comes in handy) + result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) + return result.index_copy_(dim=-1, index=self.lora_ind, source=x) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. 
@@ -379,7 +373,8 @@ def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T) weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1) return torch.cat( - [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1 # (B, C_output', T) + [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], + dim=1, # (B, C_output', T) ) # (B, C_output, T) def get_lora_AB(self) -> torch.Tensor: @@ -391,10 +386,8 @@ def get_lora_AB(self) -> torch.Tensor: lora = self.conv1d( self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) - ).squeeze( - 0 - ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) - return self.zero_pad(lora * self.scaling) # (256, 128) after zero_pad (384, 128) + ).squeeze(0) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) + return self.zero_pad(lora.T * self.scaling).T # (256, 128) after zero_pad (384, 128) def merge(self) -> None: """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" @@ -432,9 +425,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: after_B = self.conv1d( after_A.transpose(-2, -1), # (64, 64, 4) -> (64, 4, 64) self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1) - ).transpose( - -2, -1 - ) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256) + ).transpose(-2, -1) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256) lora = self.zero_pad(after_B) * self.scaling # (64, 64, 256) after zero_pad (64, 64, 384) return pretrained + lora diff --git a/tests/test_lora.py b/tests/test_lora.py index 79dc5896a8..d283f1cf44 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -107,7 +107,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn.lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) bsz, ctx_len, in_dim = 2, 30, 8 @@ -128,7 +128,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn.lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) bsz, ctx_len, in_dim = 2, 30, 8 @@ -149,7 +149,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - assert torch.equal(attn._lora_ind, torch.tensor(lora_ind)) + assert torch.equal(attn.lora_ind, torch.tensor(lora_ind)) x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) bsz, ctx_len, in_dim = 2, 30, 8 @@ -733,3 +733,28 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa logs = stdout.getvalue() assert "of trainable parameters: 512" in logs assert "of non-trainable parameters: 1,888" in logs + + +@RunIf(standalone=True, min_cuda_gpus=2) +def test_lora_model_fsdp_init(): + config = Config( + n_layer=1, + n_head=2, + n_embd=8, + block_size=8, + vocab_size=8, + lora_r=8, + lora_alpha=8, + lora_dropout=0.1, + lora_query=True, + lora_value=False, + lora_projection=True, + ) + fabric = 
Fabric(devices=2, strategy="fsdp", precision="16-true") + fabric.launch() + with fabric.init_module(empty_init=True): + model = LoRAGPT(config) + x = torch.randint(0, config.padded_vocab_size, size=(2, config.block_size), dtype=torch.int64, device=fabric.device) + model = fabric.setup(model) + y = model(x) + assert y.shape == torch.Size([2, 8, 512]) From 101e31d2e4d430b818e7e69974de6139800a36b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 6 May 2024 19:14:54 +0200 Subject: [PATCH 20/21] [thunder] Update `torch.compile` executor usage (#1264) --- extensions/thunder/README.md | 75 ++++++++++--------- extensions/thunder/pretrain.py | 2 - extensions/thunder/strategies/thunder_ddp.py | 4 +- extensions/thunder/strategies/thunder_fsdp.py | 4 +- extensions/thunder/strategies/utils.py | 28 ------- pyproject.toml | 2 +- tests/test_thunder_fsdp.py | 16 ---- 7 files changed, 42 insertions(+), 89 deletions(-) delete mode 100644 extensions/thunder/strategies/utils.py diff --git a/extensions/thunder/README.md b/extensions/thunder/README.md index 715e7745fa..a494248907 100644 --- a/extensions/thunder/README.md +++ b/extensions/thunder/README.md @@ -461,12 +461,12 @@ from extensions.thunder.strategies import ThunderFSDPStrategy, ThunderDDPStrateg strategy = ThunderFSDPStrategy( sharding_strategy="ZERO3", bucketing_strategy="BLOCK", - executors=("sdpa", "torchcompile", "nvfuser", "torch"), + executors=("sdpa", "torchcompile_cat", "nvfuser", "torch"), state_dict_type="full", ) # replicated data parallel -strategy = ThunderDDPStrategy(executors=("sdpa", "torchcompile", "nvfuser", "torch")) +strategy = ThunderDDPStrategy(executors=("sdpa", "torchcompile_cat", "nvfuser", "torch")) fabric = L.Fabric(devices=devices, strategy=strategy) fabric.launch() @@ -482,12 +482,10 @@ Thunder allows you to define a priority list of executors that can map operators ```python import thunder -from thunder.executors.sdpaex import sdpa_ex -from thunder.executors.torch_compile import torch_compile_executor model = thunder.jit( model, - executors=[sdpa_ex, torch_compile_executor, thunder.nvfuser_executor, thunder.pytorch_executor] + executors=["sdpa", "torchcompile_cat", "nvfuser", "torch"] ) ``` @@ -507,11 +505,11 @@ We can enable this executor by passing it to the list of executors available. Th `NvFuser` creates its fusion regions. ```python -from unsloth.executor import unsloth_ex +import thunder model = thunder.jit( model, - executors=[sdpa_ex, unsloth_ex, torch_compile_executor, thunder.nvfuser_executor, thunder.pytorch_executor] + executors=["sdpa", "unsloth", "torchcompile_cat", "nvfuser", "torch"] ) ``` @@ -543,21 +541,24 @@ Given the Unsloth results below, these hand-written kernels do not seem to be wo We provide a version of the main pre-training script [that integrates Thunder](pretrain.py) that uses TinyLlama, a 1.1B parameter LLM. 
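Before the numbers, here is a minimal, self-contained sketch of the executor-priority API described above; it is an illustration only (a toy module rather than the pre-training script) and it assumes `lightning-thunder` is installed and that all of the named executors are available on the machine (for example, `nvfuser` needs a CUDA setup):

```python
import torch
import thunder


class TinyMLP(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc1 = torch.nn.Linear(16, 32)
        self.fc2 = torch.nn.Linear(32, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.nn.functional.relu(self.fc1(x)))


model = TinyMLP()
# Executors are tried in priority order: each operator is claimed by the first
# executor in the list that supports it; anything left over falls back to "torch".
jitted = thunder.jit(model, executors=["sdpa", "torchcompile_cat", "nvfuser", "torch"])
y = jitted(torch.randn(4, 16))
print(y.shape)  # torch.Size([4, 16])
```

The benchmark table below compares exactly this kind of executor stack against eager execution.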
-| Setting | Compiler/JIT | Devices | ms/iter @ step 10 | Memory (GB) | -|----------------------|--------------|---------|-------------------|---------------| -| Fully-sharded ZeRO 3 | Eager | 8 | 460.88 | 22.13 | -| Fully-sharded ZeRO 3 | Inductor | 8 | Not supported | Not supported | -| Fully-sharded ZeRO 3 | Thunder | 8 | 332.48 | 21.40 | -| | | | | | -| Replicated | Eager | 8 | 535.28 | 32.05 | -| Replicated | Inductor | 8 | Not supported | Not supported | -| Replicated | Thunder | 8 | 368.25 | 27.42 | -| | | | | | -| - | Eager | 1 | 449.88 | 29.85 | -| - | Inductor | 1 | Not supported | Not supported | -| - | Thunder | 1 | 323.78 | 27.42 | -| | | | | | -| Unsloth | Thunder | 1 | 334.98 | 25.19 | +| Setting | Compiler | Executors | Devices | ms/iter @ step 10 | Memory (GB) | +|----------------------|----------|----------------------------------------|---------|-------------------|---------------| +| Fully-sharded ZeRO 3 | Eager | - | 8 | 456.57 | 22.13 | +| Fully-sharded ZeRO 3 | torch | - | 8 | Not supported | Not supported | +| Fully-sharded ZeRO 3 | Thunder | sdpa, torchcompile | 8 | Not supported | Not supported | +| Fully-sharded ZeRO 3 | Thunder | sdpa, torchcompile_cat, nvfuser, torch | 8 | 333.56 | 21.40 | +| | | | | | | +| Replicated | Eager | - | 8 | 569.46 | 32.04 | +| Replicated | torch | - | 8 | Not supported | Not supported | +| Replicated | Thunder | sdpa, torchcompile | 8 | 426.44 | 22.19 | +| Replicated | Thunder | sdpa, torchcompile_cat, nvfuser, torch | 8 | 356.01 | 27.42 | +| | | | | | | +| - | Eager | - | 1 | 447.65 | 29.84 | +| - | torch | - | 1 | Not supported | Not supported | +| - | Thunder | sdpa, torchcompile | 1 | 373.37 | 22.19 | +| - | Thunder | sdpa, torchcompile_cat, nvfuser, torch | 1 | 322.25 | 27.42 | +| | | | | | | +| Unsloth | Thunder | sdpa, torchcompile_cat, nvfuser, torch | 1 | 331.92 | 25.19 |
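As a quick reading aid for the table above, the single-device rows can be compared directly (numbers copied from the table):

```python
# Rough back-of-the-envelope comparison of the 1-GPU rows above.
eager_ms, thunder_ms = 447.65, 322.25
print(f"speedup ≈ {eager_ms / thunder_ms:.2f}x")  # ≈ 1.39x, i.e. ~28% less time per step
```

The fully-sharded 8-GPU setting improves similarly, from 456.57 to 333.56 ms/iter (≈ 1.37x).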
Reproduction details

Config:

```yaml
out_dir: out/pretrain-thunder
data: TinyStories
-tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf
+tokenizer_dir: checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0
logger_name: csv
```

Commands:

```bash
+litgpt download --repo_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tokenizer_only true
+
python extensions/thunder/pretrain.py --config config.yaml --compiler null --train.global_batch_size 32
-python extensions/thunder/pretrain.py --config config.yaml --compiler torch --train.global_batch_size 32
-python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile, nvfuser, torch]' --train.global_batch_size 32
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile]' --train.global_batch_size 32
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile_cat, nvfuser, torch]' --train.global_batch_size 32

python extensions/thunder/pretrain.py --config config.yaml --compiler null --strategy ddp
-python extensions/thunder/pretrain.py --config config.yaml --compiler torch --strategy ddp
-python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile, nvfuser, torch]' --strategy ddp
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile]' --strategy ddp
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile_cat, nvfuser, torch]' --strategy ddp

python extensions/thunder/pretrain.py --config config.yaml --compiler null --devices 1
-python extensions/thunder/pretrain.py --config config.yaml --compiler torch --devices 1
-python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile, nvfuser, torch]' --devices 1
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile]' --devices 1
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, torchcompile_cat, nvfuser, torch]' --devices 1

-python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, unsloth, torchcompile, nvfuser, torch]' --devices 1
+python extensions/thunder/pretrain.py --config config.yaml --executors '[sdpa, unsloth, torchcompile_cat, nvfuser, torch]' --devices 1
```

Gradient accumulation is disabled in the FSDP setting because Thunder does not support skipping the backward synchronization yet.

-`torch.compile` does not support compiling the `_FabricModule` due to this issue: https://github.com/pytorch/pytorch/issues/112787#issuecomment-1986827601
+`--compiler torch` (`torch.compile` without `thunder`) is not included because it does not support compiling the `_FabricModule` due to this issue: https://github.com/pytorch/pytorch/issues/112787#issuecomment-1986827601

The CUDA devices are all NVIDIA A100-SXM4-40GB.

```text
-Python version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)
+Python version: 3.10.12 [GCC 11.4.0] (64-bit runtime)
Is debug build: False
CUDA used to build PyTorch: 12.1
CUDA runtime version: 12.3.107
Nvidia driver version: 545.23.08
-pytorch-triton==3.0.0+989adb9a29
-torch==2.4.0.dev20240326+cu121
+pytorch-triton==3.0.0+45fff310c8
+torch==2.4.0.dev20240427+cu121
lightning==2.3.0.dev20240328
-lightning-thunder==0.2.0.dev20240404
+lightning-thunder==0.2.0.dev20240505
-nvfuser_cu121==0.2.0.dev20240327
+nvfuser_cu121==0.2.3.dev20240428
```

</details>
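The `pretrain.py` and strategy diffs below drop the local `_validate_executors` helper and hand executor name strings straight to Thunder, which now resolves them itself. For reference, a hedged sketch of the same name-to-executor lookup using Thunder's own registry (this reuses the `get_all_executors` call the deleted `utils.py` relied on; the exact set of registered names depends on which packages are installed):

```python
import thunder

# Map registered executor names to Executor objects, mirroring what the removed
# _validate_executors helper did; thunder.jit now accepts the string names directly.
by_name = {executor.name: executor for executor in thunder.get_all_executors()}
print(sorted(by_name))  # includes names such as "torch" and "python"; others vary by install
```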
diff --git a/extensions/thunder/pretrain.py b/extensions/thunder/pretrain.py index 24f140c9df..6aa77a745f 100644 --- a/extensions/thunder/pretrain.py +++ b/extensions/thunder/pretrain.py @@ -482,9 +482,7 @@ def jit(fn: Callable, executors: List[str]) -> Any: assert executors is not None import thunder from unsloth.executor import unsloth_ex # import for registration # noqa: F401 - from strategies.utils import _validate_executors - executors = _validate_executors(executors) return thunder.jit(fn, executors=executors) diff --git a/extensions/thunder/strategies/thunder_ddp.py b/extensions/thunder/strategies/thunder_ddp.py index 29b2af8980..9717e9b3a1 100644 --- a/extensions/thunder/strategies/thunder_ddp.py +++ b/extensions/thunder/strategies/thunder_ddp.py @@ -28,8 +28,6 @@ from torch.nn import Module from typing_extensions import override -from .utils import _validate_executors - if TYPE_CHECKING: from thunder import Executor @@ -74,7 +72,7 @@ def __init__( if not jit and executors is not None: raise ValueError(f"Passing executors={executors} doesn't have an effect with `jit={jit}`") self.jit = jit - self.executors = _validate_executors(executors) + self.executors = executors self._num_nodes = 1 self._process_group_backend: Optional[str] = process_group_backend self._timeout: Optional[timedelta] = timeout diff --git a/extensions/thunder/strategies/thunder_fsdp.py b/extensions/thunder/strategies/thunder_fsdp.py index 55b30bdf66..fe1719b29c 100644 --- a/extensions/thunder/strategies/thunder_fsdp.py +++ b/extensions/thunder/strategies/thunder_fsdp.py @@ -32,8 +32,6 @@ from torch.optim import Optimizer from typing_extensions import override -from .utils import _validate_executors - if TYPE_CHECKING: from thunder import Executor from thunder.distributed import FSDPBucketingStrategy, FSDPType @@ -122,7 +120,7 @@ def __init__( if not jit and executors is not None: raise ValueError(f"Passing executors={executors} doesn't have an effect with `jit={jit}`") self.jit = jit - self.executors = _validate_executors(executors) + self.executors = executors self._state_dict_type = state_dict_type self._fsdp_kwargs = kwargs diff --git a/extensions/thunder/strategies/utils.py b/extensions/thunder/strategies/utils.py deleted file mode 100644 index b7132cdbf1..0000000000 --- a/extensions/thunder/strategies/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import TYPE_CHECKING, Optional, Tuple, Union, Sequence - -if TYPE_CHECKING: - from thunder import Executor - - -def _validate_executors(executors: Optional[Sequence[Union["Executor", str]]]) -> Optional[Tuple["Executor", ...]]: - """Converts string executors into it's respective ``Executor`` object.""" - if executors is None: - return None - from thunder import get_all_executors - - final = [] - issues = [] - all = get_all_executors() - for executor in executors: - if isinstance(executor, str): - for existing in all: - if executor == existing.name: - final.append(existing) - break - else: - issues.append(executor) - else: - final.append(executor) - if issues: - raise ValueError(f"Did not find the executors {issues} in {all}") - return tuple(final) diff --git a/pyproject.toml b/pyproject.toml index ab7b9b26f3..40029ae13b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ test = [ "transformers>=4.38.0", # numerical comparisons "einops>=0.7.0", "protobuf>=4.23.4", - "lightning-thunder==0.2.0.dev20240404; python_version >= '3.10'", + "lightning-thunder==0.2.0.dev20240505; python_version >= '3.10'", ] all = [ "bitsandbytes==0.42.0", # quantization 
diff --git a/tests/test_thunder_fsdp.py b/tests/test_thunder_fsdp.py index 2c97ad626d..84de117574 100644 --- a/tests/test_thunder_fsdp.py +++ b/tests/test_thunder_fsdp.py @@ -1,5 +1,4 @@ import os -import re import sys from pathlib import Path from typing import Optional, Tuple, Union @@ -15,35 +14,20 @@ sys.path.append(str(wd)) from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy -from extensions.thunder.strategies.utils import _validate_executors @RunIf(thunder=True) def test_thunder_strategy_input_parsing(): - from thunder import pythonex from thunder.distributed import FSDPBucketingStrategy, FSDPType strategy = ThunderFSDPStrategy(bucketing_strategy="BlOcK", executors=("python",), sharding_strategy="zero3") assert strategy.bucketing_strategy is FSDPBucketingStrategy.BLOCK - assert strategy.executors == (pythonex,) assert strategy.sharding_strategy is FSDPType.ZERO3 with pytest.raises(ValueError, match="doesn't have an effect with `jit=False"): ThunderFSDPStrategy(jit=False, executors=("python",)) -@RunIf(thunder=True) -def test_validate_executors(): - from thunder import pythonex, pytorch_executor - - assert _validate_executors(None) is None - assert _validate_executors((pythonex, pytorch_executor)) == (pythonex, pytorch_executor) - assert _validate_executors(("python", "torch")) == (pythonex, pytorch_executor) - assert _validate_executors(("python", pytorch_executor)) == (pythonex, pytorch_executor) - with pytest.raises(ValueError, match=re.escape("not find the executors ['foo', 'bar'] in")): - assert _validate_executors(("python", "foo", pytorch_executor, "bar")) - - @RunIf(thunder=True) def test_save_checkpoint_invalid_settings_raise(tmp_path): strategy = ThunderFSDPStrategy(state_dict_type="full") From 90a16e4147f1ee649adf1a771f22bb88d6fb2a54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 6 May 2024 19:15:47 +0200 Subject: [PATCH 21/21] Remove duplicate MistralForCausalLM test (#1390) --- tests/test_model.py | 58 --------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index b8b6366fb7..1cad36a8db 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -415,64 +415,6 @@ def test_against_hf_mixtral(): torch.testing.assert_close(ours_y, theirs_y) -@torch.inference_mode() -@pytest.mark.parametrize( - ("device", "dtype"), - [ - (torch.device("cpu"), torch.float32), - pytest.param( - torch.device("cuda"), - torch.float16, - marks=[ - # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input - # is slightly different - pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), - ], - ), - ], -) -def test_against_hf_h2o_danube(device, dtype): - torch.set_default_dtype(dtype) - - ours_config = Config.from_name( - "Danube2-1.8b-chat", - padded_vocab_size=10000, - n_layer=2, - n_embd=16, - n_head=8, - n_query_groups=2, - intermediate_size=43, - ) - T = 5 - theirs_config = MistralConfig( - vocab_size=ours_config.padded_vocab_size, - hidden_size=ours_config.n_embd, - num_attention_heads=ours_config.n_head, - num_hidden_layers=ours_config.n_layer, - intermediate_size=ours_config.intermediate_size, - max_position_embeddings=T, - rms_norm_eps=ours_config.norm_eps, - num_key_value_heads=ours_config.n_query_groups, - rope_theta=ours_config.rope_base, - ) - assert ours_config.intermediate_size == theirs_config.intermediate_size - - theirs_model = MistralForCausalLM(theirs_config).to(device) - theirs_state_dict = theirs_model.state_dict() - state_dict = {} - copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device) - ours_model.load_state_dict(state_dict) - - # test end to end - x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) - assert x.size(1) == T - ours_y = ours_model(x) - theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float - torch.testing.assert_close(ours_y, theirs_y) - - @torch.inference_mode() @pytest.mark.parametrize( ("device", "dtype"),