From 3a6e637c86b6ee85c70b76fd6ebd3898e127773e Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 15:06:38 +0300 Subject: [PATCH 01/14] Use only `index_copy` without any views or reshapes. --- lit_gpt/lora.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 7645582cf7..f7d0f31b4d 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -288,12 +288,9 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) - result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) - result = result.view(-1, self.linear.out_features) # (4096, 384) - result = result.index_copy( - 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)) - ) # (4096, 256) - return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) + result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) + result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) + return result.transpose(0, 1) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. From 1d591f236a65e4b9f720a6a5ba028e0aa8214079 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 15:10:00 +0300 Subject: [PATCH 02/14] Don't do transpose required for the merge method in each `zero_pad` call. --- lit_gpt/lora.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index f7d0f31b4d..013a4ed948 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -254,7 +254,7 @@ def __init__( self.reset_parameters() def zero_pad(self, x: torch.Tensor) -> torch.Tensor: - """Properly pad weight updates with zeros. + """Properly pad the last dimension of weight updates with zeros. If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys, then the weights update should be: @@ -287,10 +287,9 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # only for key updates (this is where self.lora_ind comes in handy) # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights - x = x.transpose(0, 1) result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) - return result.transpose(0, 1) # (64, 64, 384) + return result # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. 
@@ -342,7 +341,7 @@ def merge(self): 0 ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) # W = W + delta_W (merge) - self.linear.weight.data += self.zero_pad(delta_w * self.scaling) # (256, 128) after zero_pad (384, 128) + self.linear.weight.data += self.zero_pad(delta_w.T * self.scaling).T # (256, 128) after zero_pad (384, 128) self.merged = True def forward(self, x: torch.Tensor) -> torch.Tensor: From 3d068cb382cbd333b97e3ff201db5b50ddc1d832 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 18:11:07 +0300 Subject: [PATCH 03/14] `self.lora_ind` as a property --- lit_gpt/lora.py | 60 ++++++++++++++++++++++++++++++---------------- tests/test_lora.py | 6 ++--- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 013a4ed948..43873e1540 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -233,26 +233,47 @@ def __init__( # https://github.com/cloneofsimo/lora self.scaling = self.lora_alpha / self.r - # Compute the indices - # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values, - # but not keys, then the weights update should be: - # - # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], - # [....................................], - # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] - # ↑ ↑ ↑ - # ________________________________________ - # | query | key | value | - # ---------------------------------------- - self.lora_ind = [] - if enable_q: - self.lora_ind.extend(range(0, self.linear.in_features)) - if enable_k: - self.lora_ind.extend(range(self.linear.in_features, self.linear.in_features + self.kv_embd_size)) - if enable_v: - self.lora_ind.extend(range(self.linear.in_features + self.kv_embd_size, self.linear.out_features)) self.reset_parameters() + @property + def lora_ind(self) -> torch.Tensor: + # Compute the indices + # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values, + # but not keys, then the weights update should be: + # + # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], + # [....................................], + # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] + # ↑ ↑ ↑ + # ________________________________________ + # | query | key | value | + # ---------------------------------------- + if hasattr(self, "_lora_ind"): + return self._lora_ind + + indices = [] + enable_q, enable_k, enable_v = self.enable_lora + if enable_q: + indices.append(torch.arange(0, self.linear.in_features, device=self.linear.weight.device)) + if enable_k: + indices.append( + torch.arange( + self.linear.in_features, + self.linear.in_features + self.kv_embd_size, + device=self.linear.weight.device, + ) + ) + if enable_v: + indices.append( + torch.arange( + self.linear.in_features + self.kv_embd_size, + self.linear.out_features, + device=self.linear.weight.device, + ) + ) + self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) + return self._lora_ind + def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. 
@@ -288,8 +309,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) - result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) - return result # (64, 64, 384) + return result.index_copy(dim=-1, index=self.lora_ind.clone(), source=x) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. diff --git a/tests/test_lora.py b/tests/test_lora.py index 38ac1655cd..8eff27bc92 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -92,7 +92,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23])) x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) @@ -103,7 +103,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11])) x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) @@ -114,7 +114,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15])) x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) From 89d8eaafd0e5562e8860136effb8f8cbc1601a7b Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 15:51:03 +0300 Subject: [PATCH 04/14] Updates for a case with an inference tensor --- lit_gpt/lora.py | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 43873e1540..aa6a1421e5 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -248,31 +248,38 @@ def lora_ind(self) -> torch.Tensor: # ________________________________________ # | query | key | value | # ---------------------------------------- - if hasattr(self, "_lora_ind"): - return self._lora_ind - - indices = [] - enable_q, enable_k, enable_v = self.enable_lora - if enable_q: - indices.append(torch.arange(0, self.linear.in_features, device=self.linear.weight.device)) - if enable_k: - indices.append( - torch.arange( - self.linear.in_features, - self.linear.in_features + self.kv_embd_size, - device=self.linear.weight.device, + if not hasattr(self, "_lora_ind"): + indices = [] + enable_q, enable_k, enable_v = self.enable_lora + if enable_q: + indices.append( + torch.arange( + 0, + self.linear.in_features, + device=self.linear.weight.device, + ) ) - ) - if enable_v: - indices.append( - torch.arange( - self.linear.in_features + self.kv_embd_size, - self.linear.out_features, - 
device=self.linear.weight.device, + if enable_k: + indices.append( + torch.arange( + self.linear.in_features, + self.linear.in_features + self.kv_embd_size, + device=self.linear.weight.device, + ) ) - ) - self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) - return self._lora_ind + if enable_v: + indices.append( + torch.arange( + self.linear.in_features + self.kv_embd_size, + self.linear.out_features, + device=self.linear.weight.device, + ) + ) + self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) + + # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, + # that cannot be saved for backward and has to be cloned + return self._lora_ind.clone() if self._lora_ind.is_inference() else self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. @@ -306,10 +313,8 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected # embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but # only for key updates (this is where self.lora_ind comes in handy) - # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors - # for example when we want to merge/unmerge LoRA weights and pretrained weights result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) - return result.index_copy(dim=-1, index=self.lora_ind.clone(), source=x) # (64, 64, 384) + return result.index_copy_(dim=-1, index=self.lora_ind, source=x) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. From 6785012bf528fa1c3414a9f74f6a6214bc697cf8 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 17:23:56 +0300 Subject: [PATCH 05/14] Docstring for a `lora_ind` property. --- lit_gpt/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index aa6a1421e5..4d6ad1e1dc 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -237,6 +237,7 @@ def __init__( @property def lora_ind(self) -> torch.Tensor: + """Lazy creation of a buffer with LoRA indices to overcome limitation when FSDP and meta device is used.""" # Compute the indices # Indices are needed to properly pad weight updates with zeros. 
If we want to fine-tune queries and values, # but not keys, then the weights update should be: From 8951570e1cf61670311b1beba537b8e2d5a39bf2 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 18:28:43 +0300 Subject: [PATCH 06/14] Reassign `self._lora_ind` so it will be recreated outside inference mode --- lit_gpt/lora.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 4d6ad1e1dc..c2b4aee269 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -280,7 +280,9 @@ def lora_ind(self) -> torch.Tensor: # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, # that cannot be saved for backward and has to be cloned - return self._lora_ind.clone() if self._lora_ind.is_inference() else self._lora_ind + if self._lora_ind.is_inference(): + self._lora_ind = self._lora_ind.clone() + return self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. From eb73d271a256b8572327a18f97b8518c2322d219 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Fri, 22 Sep 2023 17:31:24 +0300 Subject: [PATCH 07/14] Make `lora_ind` property a bit shorter. --- lit_gpt/lora.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index c2b4aee269..b99b1f367c 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -252,30 +252,14 @@ def lora_ind(self) -> torch.Tensor: if not hasattr(self, "_lora_ind"): indices = [] enable_q, enable_k, enable_v = self.enable_lora + in_features, out_features = self.linear.in_features, self.linear.out_features + device = self.linear.weight.device if enable_q: - indices.append( - torch.arange( - 0, - self.linear.in_features, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(0, in_features, device=device)) if enable_k: - indices.append( - torch.arange( - self.linear.in_features, - self.linear.in_features + self.kv_embd_size, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(in_features, in_features + self.kv_embd_size, device=device)) if enable_v: - indices.append( - torch.arange( - self.linear.in_features + self.kv_embd_size, - self.linear.out_features, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(in_features + self.kv_embd_size, out_features, device=device)) self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, From 4cc6cb38fe660e1e4d166bf4e95e808780a3a3ac Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Fri, 22 Sep 2023 17:39:12 +0300 Subject: [PATCH 08/14] Trim comments for `lora_ind` property. --- lit_gpt/lora.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index b99b1f367c..864fa70d31 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -237,18 +237,8 @@ def __init__( @property def lora_ind(self) -> torch.Tensor: - """Lazy creation of a buffer with LoRA indices to overcome limitation when FSDP and meta device is used.""" - # Compute the indices - # Indices are needed to properly pad weight updates with zeros. 
If we want to fine-tune queries and values, - # but not keys, then the weights update should be: - # - # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], - # [....................................], - # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] - # ↑ ↑ ↑ - # ________________________________________ - # | query | key | value | - # ---------------------------------------- + """Lazy creation of a buffer with LoRA indices to overcome the limitation when FSDP with meta device is used.""" + # Indices are needed to properly pad weight updates with zeros. if not hasattr(self, "_lora_ind"): indices = [] enable_q, enable_k, enable_v = self.enable_lora From 7013fed8c15fd7e5a4d285b576532f96972521cc Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 9 Oct 2023 18:23:24 +0300 Subject: [PATCH 09/14] If validate is running in no_grad mode there is no need to clone ind --- finetune/lora.py | 2 +- lit_gpt/lora.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/finetune/lora.py b/finetune/lora.py index cc2216fb5d..bbcde868d2 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -250,7 +250,7 @@ def train( save_lora_checkpoint(fabric, model, checkpoint_path) -@torch.inference_mode() +@torch.no_grad() def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: fabric.print("Validating ...") model.eval() diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 6ae7a6baa7..b98fafcabb 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -252,10 +252,6 @@ def lora_ind(self) -> torch.Tensor: indices.append(torch.arange(in_features + self.kv_embd_size, out_features, device=device)) self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) - # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, - # that cannot be saved for backward and has to be cloned - if self._lora_ind.is_inference(): - self._lora_ind = self._lora_ind.clone() return self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: From 4b20c886626020beb6b004f38696c1f55df7cd82 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:13:40 +0300 Subject: [PATCH 10/14] Revert "Minor tutorial updates" This reverts commit e4fb763d7047c870a5c52e55aea2123a6b29b37a. --- tutorials/inference.md | 4 +- tutorials/prepare_dataset.md | 6 +- tutorials/resource-tables.md | 140 +++++++++++++++++------------------ 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/tutorials/inference.md b/tutorials/inference.md index 91366bab87..bb21cef4db 100644 --- a/tutorials/inference.md +++ b/tutorials/inference.md @@ -41,7 +41,7 @@ For instance, `falcon-40b` would require ~80 GB of GPU memory to run on a single python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 4 ``` -Which will take ~25 GB of memory, and run at 2.5 tokens/sec. +Which will take 32 GB of memory, and run at 0.37 tokens/sec. Or to reduce the memory requirements even further, you can try using CPU offloading. For that, you will need to manually edit the `cpu_offload=False` parameter in the file and set it to `True`. @@ -51,5 +51,5 @@ Now we can run it on just 2 devices. python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 2 ``` -taking ~5 GB of memory but running at 0.23 tokens/sec on 2 A100 40GB GPUs. +taking 13 GB of memory but running at 0.12 tokens/sec on 2 A100 40GB GPUs. Smaller devices like 3090s (24 GB) can also fit it with this technique. 
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index 77af9fc75f..f21d16b87c 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -8,10 +8,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT: | Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [ URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | | Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) | | Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) | -| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | +| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | | LIMA | Finetuning | 1,084 samples | [URL](https://huggingface.co/datasets/GAIR/lima) | [URL](https://arxiv.org/abs/2305.11206) | "If the source data of LIMA has a stricter license than CC BY-NC-SA, the LIMA dataset follows the same. Otherwise, it follows the CC BY-NC-SA license", [URL](https://huggingface.co/datasets/GAIR/lima#license) | | OpenWeb Text | Pretraining | 8,013,769 documents | [URL](https://github.com/jcpeterson/openwebtext) | [URL](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Unspecified | -| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | +| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | | |   @@ -139,7 +139,7 @@ The more detailed dataset composition is as follows based on a table taken from | | BEA-GEC | 1,203 | | | Enron | 372 | | **Total** | | 27,739 | -| | | | +| | | | | **Train** | | 23,652 | | **Validation** | | 2,042 | | **Test** | | 2,045 | diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 67e9b37039..62eca9184c 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -39,35 +39,35 @@ Note that the number of tokens in the training set does not affect the supported The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. 
-| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | -| | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +| ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | +| | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | 62.23 min | | 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | 91.22 min | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | 93.58 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | 66.68 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | 74.89 min | -| | | | | | | | | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | -| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | -| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | -| | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | -| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | -| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | - | -| | | | | | | | | -| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - | +| | | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | 
- | +| | | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - |   @@ -75,35 +75,35 @@ The following experiments were conducted on 1xA100 with a minibatch size of 128 The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | -| | | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +| ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | +| | | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | 145.96 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | 153.10 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | 192.99 min | -| | | | | | | | - | - | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | -| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | - | +| | | | | | | | - | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | +| 40 B | Falcon | 
None | 1 | 12,042,240 | 4 x A100 | OOM | - | - |   ## Single-GPU Inference -| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | -|-------|----------------|--------------|----------|-------------------------------------------|-----------| +| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | +|-------|----------------|--------------|----------|--------------------------------------------|-----------| | 1.3 B | phi-1.5 | None | 1 x A100 | 2.86 GB | 42.56 | | 1.3 B | phi-1.5 | bnb.nf4 | 1 x A100 | 1.39 GB | 22.89 | | 1.3 B | phi-1.5 | bnb.nf4-dq | 1 x A100 | 1.33 GB | 22.75 | @@ -113,28 +113,28 @@ The following experiments were conducted on multiple A100 GPUs with a minibatch | 3 B | StableLM Alpha | bnb.nf4 | 1 x A100 | 3.20 GB | 29.04 | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 x A100 | 3.04 GB | 27.15 | | 3 B | StableLM Alpha | gptq.int4 | 1 x A100 | 2.43 GB | 5.9 | -| | | | | | | -| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | -| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | -| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | -| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | -| | | | | | | -| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | -| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | -| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | -| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | -| | | | | | | -| 34 B | CodeLlama | None | 1 x A100 | OOM | - | -| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | -| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | -| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 40 B | Falcon | None | 1 x A100 | OOM | - | -| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | -| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | -| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 70 B | Llama 2 | None | 1 x A100 | OOM | - | -| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | -| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | -| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | +| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | +| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | +| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | +| | | | | | | +| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | +| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | +| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | +| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | +| | | | | | | +| 34 B | CodeLlama | None | 1 x A100 | OOM | - | +| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | +| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | +| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 40 B | Falcon | None | 1 x A100 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | +| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | +| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 70 B | Llama 2 | None | 1 x A100 | OOM | - | +| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | +| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | +| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | From daf0e4eb2a44636675eed7ab579f80369c8a7dd5 
Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:13:56 +0300 Subject: [PATCH 11/14] Revert "Fix typo" This reverts commit e05c9b4783366dcd7196aefddddc5b280af4b022. --- tutorials/resource-tables.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 62eca9184c..b8b18913ea 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -37,7 +37,7 @@ Note that the number of tokens in the training set does not affect the supported ## Finetuning with LoRA on 1 GPU -The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | | ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | @@ -73,7 +73,7 @@ The following experiments were conducted on 1xA100 with a minibatch size of 128 ## Finetuning with LoRA on Multiple GPUs -The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | | ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | From 652d7bdeca690911629d9245ab268042828edeee Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:14:21 +0300 Subject: [PATCH 12/14] Revert "Revert "Minor tutorial updates"" This reverts commit 445af8ba970f7b652dad84582a8bd9a0d1c640af. --- tutorials/inference.md | 4 +- tutorials/prepare_dataset.md | 6 +- tutorials/resource-tables.md | 140 +++++++++++++++++------------------ 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/tutorials/inference.md b/tutorials/inference.md index bb21cef4db..91366bab87 100644 --- a/tutorials/inference.md +++ b/tutorials/inference.md @@ -41,7 +41,7 @@ For instance, `falcon-40b` would require ~80 GB of GPU memory to run on a single python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 4 ``` -Which will take 32 GB of memory, and run at 0.37 tokens/sec. +Which will take ~25 GB of memory, and run at 2.5 tokens/sec. Or to reduce the memory requirements even further, you can try using CPU offloading. For that, you will need to manually edit the `cpu_offload=False` parameter in the file and set it to `True`. @@ -51,5 +51,5 @@ Now we can run it on just 2 devices. python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 2 ``` -taking 13 GB of memory but running at 0.12 tokens/sec on 2 A100 40GB GPUs. +taking ~5 GB of memory but running at 0.23 tokens/sec on 2 A100 40GB GPUs. Smaller devices like 3090s (24 GB) can also fit it with this technique. 
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index f21d16b87c..77af9fc75f 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -8,10 +8,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT: | Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [ URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | | Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) | | Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) | -| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | +| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | | LIMA | Finetuning | 1,084 samples | [URL](https://huggingface.co/datasets/GAIR/lima) | [URL](https://arxiv.org/abs/2305.11206) | "If the source data of LIMA has a stricter license than CC BY-NC-SA, the LIMA dataset follows the same. Otherwise, it follows the CC BY-NC-SA license", [URL](https://huggingface.co/datasets/GAIR/lima#license) | | OpenWeb Text | Pretraining | 8,013,769 documents | [URL](https://github.com/jcpeterson/openwebtext) | [URL](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Unspecified | -| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | | | +| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) |   @@ -139,7 +139,7 @@ The more detailed dataset composition is as follows based on a table taken from | | BEA-GEC | 1,203 | | | Enron | 372 | | **Total** | | 27,739 | -| | | | +| | | | | **Train** | | 23,652 | | **Validation** | | 2,042 | | **Test** | | 2,045 | diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index b8b18913ea..193684f6b8 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -39,35 +39,35 @@ Note that the number of tokens in the training set does not affect the supported The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. 
-| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -| ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | -| | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | +| | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | 62.23 min | | 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | 91.22 min | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | 93.58 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | 66.68 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | 74.89 min | -| | | | | | | | | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | -| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | -| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | -| | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | -| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | -| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | - | -| | | | | | | | | -| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - | +| | | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | 
- | +| | | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - |   @@ -75,35 +75,35 @@ The following experiments were conducated on 1xA100 with a minibatch size of 128 The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -| ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | -| | | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | +| | | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | 145.96 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | 153.10 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | 192.99 min | -| | | | | | | | - | - | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | -| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | - | +| | | | | | | | - | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | +| 40 B | Falcon | 
None | 1 | 12,042,240 | 4 x A100 | OOM | - | - |   ## Single-GPU Inference -| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | -|-------|----------------|--------------|----------|--------------------------------------------|-----------| +| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | +|-------|----------------|--------------|----------|-------------------------------------------|-----------| | 1.3 B | phi-1.5 | None | 1 x A100 | 2.86 GB | 42.56 | | 1.3 B | phi-1.5 | bnb.nf4 | 1 x A100 | 1.39 GB | 22.89 | | 1.3 B | phi-1.5 | bnb.nf4-dq | 1 x A100 | 1.33 GB | 22.75 | @@ -113,28 +113,28 @@ The following experiments were conducated on multiple A100 GPUs with a minibatch | 3 B | StableLM Alpha | bnb.nf4 | 1 x A100 | 3.20 GB | 29.04 | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 x A100 | 3.04 GB | 27.15 | | 3 B | StableLM Alpha | gptq.int4 | 1 x A100 | 2.43 GB | 5.9 | -| | | | | | | -| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | -| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | -| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | -| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | -| | | | | | | -| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | -| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | -| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | -| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | -| | | | | | | -| 34 B | CodeLlama | None | 1 x A100 | OOM | - | -| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | -| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | -| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 40 B | Falcon | None | 1 x A100 | OOM | - | -| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | -| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | -| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 70 B | Llama 2 | None | 1 x A100 | OOM | - | -| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | -| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | -| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | +| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | +| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | +| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | +| | | | | | | +| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | +| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | +| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | +| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | +| | | | | | | +| 34 B | CodeLlama | None | 1 x A100 | OOM | - | +| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | +| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | +| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 40 B | Falcon | None | 1 x A100 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | +| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | +| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 70 B | Llama 2 | None | 1 x A100 | OOM | - | +| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | +| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | +| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | From 015f8683ba472d3e7fbc57f932ccf14a7f97fcfa 
Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Thu, 23 Nov 2023 17:03:22 +0300 Subject: [PATCH 13/14] Undo weirdly appeared typo. --- tutorials/resource-tables.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 193684f6b8..67e9b37039 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -37,7 +37,7 @@ Note that the number of tokens in the training set does not affect the supported ## Finetuning with LoRA on 1 GPU -The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | |-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| @@ -73,7 +73,7 @@ The following experiments were conducated on 1xA100 with a minibatch size of 128 ## Finetuning with LoRA on Multiple GPUs -The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | |-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| From 1ef81f33b283505f1c18e6e30599148a563add8e Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 6 May 2024 16:26:35 +0300 Subject: [PATCH 14/14] Add FSDP test with empty_init=True --- tests/test_lora.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_lora.py b/tests/test_lora.py index 431c53cfca..1ca2843a1f 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -733,3 +733,28 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa logs = stdout.getvalue() assert "of trainable parameters: 512" in logs assert "of non-trainable parameters: 1,888" in logs + + +@RunIf(standalone=True, min_cuda_gpus=2) +def test_lora_model_fsdp_init(): + config = Config( + n_layer=1, + n_head=2, + n_embd=8, + block_size=8, + vocab_size=8, + lora_r=8, + lora_alpha=8, + lora_dropout=0.1, + lora_query=True, + lora_value=False, + lora_projection=True, + ) + fabric = Fabric(devices=2, strategy="fsdp", precision="16-true") + fabric.launch() + with fabric.init_module(empty_init=True): + model = LoRAGPT(config) + x = torch.randint(0, config.padded_vocab_size, size=(2, config.block_size), dtype=torch.int64, device=fabric.device) + model = fabric.setup(model) + y = model(x) + assert y.shape == torch.Size([2, 8, 512])
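
For reference, below is a minimal, self-contained sketch of the zero-padding technique the series converges on: scatter the LoRA update for the enabled q/k/v groups into the full fused-QKV output dimension with `index_copy_`, using index positions that are built lazily and cached as a non-persistent buffer. This is not taken from the patches; the class name `ZeroPadDemo`, the sizes (q=128, k=v=64), and the `__main__` check are invented for illustration — the real module wires the same idea into `LoRAQKVLinear` via `self.linear.in_features`/`out_features` and `self.kv_embd_size`, as shown in the hunks above.

```python
import torch
import torch.nn as nn


class ZeroPadDemo(nn.Module):
    """Toy stand-in for the fused-QKV LoRA layer: queries and values get updates, keys do not."""

    def __init__(self, q_size: int = 128, kv_size: int = 64,
                 enable_lora: tuple = (True, False, True)) -> None:
        super().__init__()
        self.q_size, self.kv_size = q_size, kv_size
        self.out_features = q_size + 2 * kv_size          # fused q + k + v output dim
        self.enable_lora = enable_lora

    @property
    def lora_ind(self) -> torch.Tensor:
        # Build the target column indices once and cache them as a non-persistent
        # buffer so they follow the module across devices; the lazy creation is
        # what keeps this compatible with meta-device / FSDP initialization.
        if not hasattr(self, "_lora_ind"):
            enable_q, enable_k, enable_v = self.enable_lora
            ind = []
            if enable_q:
                ind.append(torch.arange(0, self.q_size))
            if enable_k:
                ind.append(torch.arange(self.q_size, self.q_size + self.kv_size))
            if enable_v:
                ind.append(torch.arange(self.q_size + self.kv_size, self.out_features))
            self.register_buffer("_lora_ind", torch.cat(ind), persistent=False)
        return self._lora_ind

    def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
        # x: (..., sum of enabled group sizes) -> (..., out_features); the columns
        # of disabled groups (here: keys) are left as zeros.
        result = x.new_zeros(*x.shape[:-1], self.out_features)
        return result.index_copy_(dim=-1, index=self.lora_ind, source=x)


if __name__ == "__main__":
    demo = ZeroPadDemo()
    x = torch.randn(2, 4, 128 + 64)       # LoRA updates for queries and values only
    padded = demo.zero_pad(x)
    assert padded.shape == (2, 4, 256)
    assert torch.all(padded[..., 128:192] == 0)   # key block stays zero
```

Because `zero_pad` now pads only the last dimension, two-dimensional callers transpose around it rather than inside it — which is why the `merge` hunk in patch 02 becomes `self.zero_pad(delta_w.T * self.scaling).T`.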
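
A short standalone note on the inference-tensor detour in patches 04–09 (plain PyTorch, no Fabric; the variable names are made up): a tensor created under `torch.inference_mode()` cannot be saved for backward later, so the buffer was first cloned on access, and the series ultimately switched `validate` to `torch.no_grad()`, which produces ordinary tensors and needs no workaround.

```python
import torch

with torch.inference_mode():
    ind = torch.arange(3)
# `ind` is an inference tensor: if it were used as the index of `index_copy_`
# in a training forward pass, autograd would refuse to save it for backward.
print(ind.is_inference())           # True
print(ind.clone().is_inference())   # False: cloning outside inference mode is the workaround

with torch.no_grad():
    ind2 = torch.arange(3)
print(ind2.is_inference())          # False: created under no_grad, nothing to work around
```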