From 3a6e637c86b6ee85c70b76fd6ebd3898e127773e Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 15:06:38 +0300 Subject: [PATCH 01/14] Use only `index_copy` without any views or reshapes. --- lit_gpt/lora.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 7645582cf7..f7d0f31b4d 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -288,12 +288,9 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) - result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) - result = result.view(-1, self.linear.out_features) # (4096, 384) - result = result.index_copy( - 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)) - ) # (4096, 256) - return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) + result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) + result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) + return result.transpose(0, 1) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. From 1d591f236a65e4b9f720a6a5ba028e0aa8214079 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 15:10:00 +0300 Subject: [PATCH 02/14] Don't do transpose required for the merge method in each `zero_pad` call. --- lit_gpt/lora.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index f7d0f31b4d..013a4ed948 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -254,7 +254,7 @@ def __init__( self.reset_parameters() def zero_pad(self, x: torch.Tensor) -> torch.Tensor: - """Properly pad weight updates with zeros. + """Properly pad the last dimension of weight updates with zeros. If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys, then the weights update should be: @@ -287,10 +287,9 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # only for key updates (this is where self.lora_ind comes in handy) # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights - x = x.transpose(0, 1) result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) - return result.transpose(0, 1) # (64, 64, 384) + return result # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. 
@@ -342,7 +341,7 @@ def merge(self): 0 ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) # W = W + delta_W (merge) - self.linear.weight.data += self.zero_pad(delta_w * self.scaling) # (256, 128) after zero_pad (384, 128) + self.linear.weight.data += self.zero_pad(delta_w.T * self.scaling).T # (256, 128) after zero_pad (384, 128) self.merged = True def forward(self, x: torch.Tensor) -> torch.Tensor: From 3d068cb382cbd333b97e3ff201db5b50ddc1d832 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 18 Sep 2023 18:11:07 +0300 Subject: [PATCH 03/14] `self.lora_ind` as a property --- lit_gpt/lora.py | 60 ++++++++++++++++++++++++++++++---------------- tests/test_lora.py | 6 ++--- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 013a4ed948..43873e1540 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -233,26 +233,47 @@ def __init__( # https://github.com/cloneofsimo/lora self.scaling = self.lora_alpha / self.r - # Compute the indices - # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values, - # but not keys, then the weights update should be: - # - # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], - # [....................................], - # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] - # ↑ ↑ ↑ - # ________________________________________ - # | query | key | value | - # ---------------------------------------- - self.lora_ind = [] - if enable_q: - self.lora_ind.extend(range(0, self.linear.in_features)) - if enable_k: - self.lora_ind.extend(range(self.linear.in_features, self.linear.in_features + self.kv_embd_size)) - if enable_v: - self.lora_ind.extend(range(self.linear.in_features + self.kv_embd_size, self.linear.out_features)) self.reset_parameters() + @property + def lora_ind(self) -> torch.Tensor: + # Compute the indices + # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values, + # but not keys, then the weights update should be: + # + # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], + # [....................................], + # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] + # ↑ ↑ ↑ + # ________________________________________ + # | query | key | value | + # ---------------------------------------- + if hasattr(self, "_lora_ind"): + return self._lora_ind + + indices = [] + enable_q, enable_k, enable_v = self.enable_lora + if enable_q: + indices.append(torch.arange(0, self.linear.in_features, device=self.linear.weight.device)) + if enable_k: + indices.append( + torch.arange( + self.linear.in_features, + self.linear.in_features + self.kv_embd_size, + device=self.linear.weight.device, + ) + ) + if enable_v: + indices.append( + torch.arange( + self.linear.in_features + self.kv_embd_size, + self.linear.out_features, + device=self.linear.weight.device, + ) + ) + self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) + return self._lora_ind + def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. 
@@ -288,8 +309,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) - result.index_copy_(dim=-1, index=torch.tensor(self.lora_ind, device=result.device), source=x) # (64, 64, 384) - return result # (64, 64, 384) + return result.index_copy(dim=-1, index=self.lora_ind.clone(), source=x) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. diff --git a/tests/test_lora.py b/tests/test_lora.py index 38ac1655cd..8eff27bc92 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -92,7 +92,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23])) x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) @@ -103,7 +103,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11])) x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) @@ -114,7 +114,7 @@ def test_lora_mqa_gqa(): assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15] + assert torch.equal(attn.lora_ind, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15])) x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) From 89d8eaafd0e5562e8860136effb8f8cbc1601a7b Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 15:51:03 +0300 Subject: [PATCH 04/14] Updates for a case with an inference tensor --- lit_gpt/lora.py | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 43873e1540..aa6a1421e5 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -248,31 +248,38 @@ def lora_ind(self) -> torch.Tensor: # ________________________________________ # | query | key | value | # ---------------------------------------- - if hasattr(self, "_lora_ind"): - return self._lora_ind - - indices = [] - enable_q, enable_k, enable_v = self.enable_lora - if enable_q: - indices.append(torch.arange(0, self.linear.in_features, device=self.linear.weight.device)) - if enable_k: - indices.append( - torch.arange( - self.linear.in_features, - self.linear.in_features + self.kv_embd_size, - device=self.linear.weight.device, + if not hasattr(self, "_lora_ind"): + indices = [] + enable_q, enable_k, enable_v = self.enable_lora + if enable_q: + indices.append( + torch.arange( + 0, + self.linear.in_features, + device=self.linear.weight.device, + ) ) - ) - if enable_v: - indices.append( - torch.arange( - self.linear.in_features + self.kv_embd_size, - self.linear.out_features, - 
device=self.linear.weight.device, + if enable_k: + indices.append( + torch.arange( + self.linear.in_features, + self.linear.in_features + self.kv_embd_size, + device=self.linear.weight.device, + ) ) - ) - self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) - return self._lora_ind + if enable_v: + indices.append( + torch.arange( + self.linear.in_features + self.kv_embd_size, + self.linear.out_features, + device=self.linear.weight.device, + ) + ) + self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) + + # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, + # that cannot be saved for backward and has to be cloned + return self._lora_ind.clone() if self._lora_ind.is_inference() else self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. @@ -306,10 +313,8 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected # embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but # only for key updates (this is where self.lora_ind comes in handy) - # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors - # for example when we want to merge/unmerge LoRA weights and pretrained weights result = x.new_zeros(*x.shape[:-1], self.linear.out_features) # (64, 64, 384) - return result.index_copy(dim=-1, index=self.lora_ind.clone(), source=x) # (64, 64, 384) + return result.index_copy_(dim=-1, index=self.lora_ind, source=x) # (64, 64, 384) def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. From 6785012bf528fa1c3414a9f74f6a6214bc697cf8 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 17:23:56 +0300 Subject: [PATCH 05/14] Docstring for a `lora_ind` property. --- lit_gpt/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index aa6a1421e5..4d6ad1e1dc 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -237,6 +237,7 @@ def __init__( @property def lora_ind(self) -> torch.Tensor: + """Lazy creation of a buffer with LoRA indices to overcome limitation when FSDP and meta device is used.""" # Compute the indices # Indices are needed to properly pad weight updates with zeros. 
If we want to fine-tune queries and values, # but not keys, then the weights update should be: From 8951570e1cf61670311b1beba537b8e2d5a39bf2 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 19 Sep 2023 18:28:43 +0300 Subject: [PATCH 06/14] Reassign `self._lora_ind` so it will be recreated outside inference mode --- lit_gpt/lora.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 4d6ad1e1dc..c2b4aee269 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -280,7 +280,9 @@ def lora_ind(self) -> torch.Tensor: # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, # that cannot be saved for backward and has to be cloned - return self._lora_ind.clone() if self._lora_ind.is_inference() else self._lora_ind + if self._lora_ind.is_inference(): + self._lora_ind = self._lora_ind.clone() + return self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: """Properly pad the last dimension of weight updates with zeros. From eb73d271a256b8572327a18f97b8518c2322d219 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Fri, 22 Sep 2023 17:31:24 +0300 Subject: [PATCH 07/14] Make `lora_ind` property a bit shorter. --- lit_gpt/lora.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index c2b4aee269..b99b1f367c 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -252,30 +252,14 @@ def lora_ind(self) -> torch.Tensor: if not hasattr(self, "_lora_ind"): indices = [] enable_q, enable_k, enable_v = self.enable_lora + in_features, out_features = self.linear.in_features, self.linear.out_features + device = self.linear.weight.device if enable_q: - indices.append( - torch.arange( - 0, - self.linear.in_features, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(0, in_features, device=device)) if enable_k: - indices.append( - torch.arange( - self.linear.in_features, - self.linear.in_features + self.kv_embd_size, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(in_features, in_features + self.kv_embd_size, device=device)) if enable_v: - indices.append( - torch.arange( - self.linear.in_features + self.kv_embd_size, - self.linear.out_features, - device=self.linear.weight.device, - ) - ) + indices.append(torch.arange(in_features + self.kv_embd_size, out_features, device=device)) self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, From 4cc6cb38fe660e1e4d166bf4e95e808780a3a3ac Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Fri, 22 Sep 2023 17:39:12 +0300 Subject: [PATCH 08/14] Trim comments for `lora_ind` property. --- lit_gpt/lora.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index b99b1f367c..864fa70d31 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -237,18 +237,8 @@ def __init__( @property def lora_ind(self) -> torch.Tensor: - """Lazy creation of a buffer with LoRA indices to overcome limitation when FSDP and meta device is used.""" - # Compute the indices - # Indices are needed to properly pad weight updates with zeros. 
If we want to fine-tune queries and values, - # but not keys, then the weights update should be: - # - # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], - # [....................................], - # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] - # ↑ ↑ ↑ - # ________________________________________ - # | query | key | value | - # ---------------------------------------- + """Lazy creation of a buffer with LoRA indices to overcome the limitation when FSDP with meta device is used.""" + # Indices are needed to properly pad weight updates with zeros. if not hasattr(self, "_lora_ind"): indices = [] enable_q, enable_k, enable_v = self.enable_lora From 7013fed8c15fd7e5a4d285b576532f96972521cc Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 9 Oct 2023 18:23:24 +0300 Subject: [PATCH 09/14] If validate is running in no_grad mode there is no need to clone ind --- finetune/lora.py | 2 +- lit_gpt/lora.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/finetune/lora.py b/finetune/lora.py index cc2216fb5d..bbcde868d2 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -250,7 +250,7 @@ def train( save_lora_checkpoint(fabric, model, checkpoint_path) -@torch.inference_mode() +@torch.no_grad() def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: fabric.print("Validating ...") model.eval() diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 6ae7a6baa7..b98fafcabb 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -252,10 +252,6 @@ def lora_ind(self) -> torch.Tensor: indices.append(torch.arange(in_features + self.kv_embd_size, out_features, device=device)) self.register_buffer("_lora_ind", torch.cat(indices), persistent=False) - # in case `lora_ind` was created in `inference_mode` and thus it's an inference tensor, - # that cannot be saved for backward and has to be cloned - if self._lora_ind.is_inference(): - self._lora_ind = self._lora_ind.clone() return self._lora_ind def zero_pad(self, x: torch.Tensor) -> torch.Tensor: From 4b20c886626020beb6b004f38696c1f55df7cd82 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:13:40 +0300 Subject: [PATCH 10/14] Revert "Minor tutorial updates" This reverts commit e4fb763d7047c870a5c52e55aea2123a6b29b37a. --- tutorials/inference.md | 4 +- tutorials/prepare_dataset.md | 6 +- tutorials/resource-tables.md | 140 +++++++++++++++++------------------ 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/tutorials/inference.md b/tutorials/inference.md index 91366bab87..bb21cef4db 100644 --- a/tutorials/inference.md +++ b/tutorials/inference.md @@ -41,7 +41,7 @@ For instance, `falcon-40b` would require ~80 GB of GPU memory to run on a single python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 4 ``` -Which will take ~25 GB of memory, and run at 2.5 tokens/sec. +Which will take 32 GB of memory, and run at 0.37 tokens/sec. Or to reduce the memory requirements even further, you can try using CPU offloading. For that, you will need to manually edit the `cpu_offload=False` parameter in the file and set it to `True`. @@ -51,5 +51,5 @@ Now we can run it on just 2 devices. python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 2 ``` -taking ~5 GB of memory but running at 0.23 tokens/sec on 2 A100 40GB GPUs. +taking 13 GB of memory but running at 0.12 tokens/sec on 2 A100 40GB GPUs. Smaller devices like 3090s (24 GB) can also fit it with this technique. 
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index 77af9fc75f..f21d16b87c 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -8,10 +8,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT: | Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [ URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | | Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) | | Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) | -| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | +| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | | LIMA | Finetuning | 1,084 samples | [URL](https://huggingface.co/datasets/GAIR/lima) | [URL](https://arxiv.org/abs/2305.11206) | "If the source data of LIMA has a stricter license than CC BY-NC-SA, the LIMA dataset follows the same. Otherwise, it follows the CC BY-NC-SA license", [URL](https://huggingface.co/datasets/GAIR/lima#license) | | OpenWeb Text | Pretraining | 8,013,769 documents | [URL](https://github.com/jcpeterson/openwebtext) | [URL](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Unspecified | -| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | +| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | | |   @@ -139,7 +139,7 @@ The more detailed dataset composition is as follows based on a table taken from | | BEA-GEC | 1,203 | | | Enron | 372 | | **Total** | | 27,739 | -| | | | +| | | | | **Train** | | 23,652 | | **Validation** | | 2,042 | | **Test** | | 2,045 | diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 67e9b37039..62eca9184c 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -39,35 +39,35 @@ Note that the number of tokens in the training set does not affect the supported The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. 
-| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | -| | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +| ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | +| | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | 62.23 min | | 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | 91.22 min | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | 93.58 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | 66.68 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | 74.89 min | -| | | | | | | | | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | -| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | -| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | -| | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | -| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | -| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | - | -| | | | | | | | | -| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - | +| | | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | 
- | +| | | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - |   @@ -75,35 +75,35 @@ The following experiments were conducted on 1xA100 with a minibatch size of 128 The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | -| | | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +| ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | +| | | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | 145.96 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | 153.10 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | 192.99 min | -| | | | | | | | - | - | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | -| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | - | +| | | | | | | | - | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | +| 40 B | Falcon | 
None | 1 | 12,042,240 | 4 x A100 | OOM | - | - |   ## Single-GPU Inference -| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | -|-------|----------------|--------------|----------|-------------------------------------------|-----------| +| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | +|-------|----------------|--------------|----------|--------------------------------------------|-----------| | 1.3 B | phi-1.5 | None | 1 x A100 | 2.86 GB | 42.56 | | 1.3 B | phi-1.5 | bnb.nf4 | 1 x A100 | 1.39 GB | 22.89 | | 1.3 B | phi-1.5 | bnb.nf4-dq | 1 x A100 | 1.33 GB | 22.75 | @@ -113,28 +113,28 @@ The following experiments were conducted on multiple A100 GPUs with a minibatch | 3 B | StableLM Alpha | bnb.nf4 | 1 x A100 | 3.20 GB | 29.04 | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 x A100 | 3.04 GB | 27.15 | | 3 B | StableLM Alpha | gptq.int4 | 1 x A100 | 2.43 GB | 5.9 | -| | | | | | | -| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | -| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | -| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | -| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | -| | | | | | | -| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | -| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | -| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | -| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | -| | | | | | | -| 34 B | CodeLlama | None | 1 x A100 | OOM | - | -| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | -| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | -| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 40 B | Falcon | None | 1 x A100 | OOM | - | -| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | -| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | -| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 70 B | Llama 2 | None | 1 x A100 | OOM | - | -| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | -| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | -| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | +| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | +| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | +| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | +| | | | | | | +| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | +| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | +| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | +| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | +| | | | | | | +| 34 B | CodeLlama | None | 1 x A100 | OOM | - | +| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | +| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | +| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 40 B | Falcon | None | 1 x A100 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | +| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | +| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 70 B | Llama 2 | None | 1 x A100 | OOM | - | +| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | +| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | +| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | From daf0e4eb2a44636675eed7ab579f80369c8a7dd5 
Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:13:56 +0300 Subject: [PATCH 11/14] Revert "Fix typo" This reverts commit e05c9b4783366dcd7196aefddddc5b280af4b022. --- tutorials/resource-tables.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 62eca9184c..b8b18913ea 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -37,7 +37,7 @@ Note that the number of tokens in the training set does not affect the supported ## Finetuning with LoRA on 1 GPU -The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | | ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | @@ -73,7 +73,7 @@ The following experiments were conducted on 1xA100 with a minibatch size of 128 ## Finetuning with LoRA on Multiple GPUs -The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | | ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | From 652d7bdeca690911629d9245ab268042828edeee Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Tue, 21 Nov 2023 13:14:21 +0300 Subject: [PATCH 12/14] Revert "Revert "Minor tutorial updates"" This reverts commit 445af8ba970f7b652dad84582a8bd9a0d1c640af. --- tutorials/inference.md | 4 +- tutorials/prepare_dataset.md | 6 +- tutorials/resource-tables.md | 140 +++++++++++++++++------------------ 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/tutorials/inference.md b/tutorials/inference.md index bb21cef4db..91366bab87 100644 --- a/tutorials/inference.md +++ b/tutorials/inference.md @@ -41,7 +41,7 @@ For instance, `falcon-40b` would require ~80 GB of GPU memory to run on a single python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 4 ``` -Which will take 32 GB of memory, and run at 0.37 tokens/sec. +Which will take ~25 GB of memory, and run at 2.5 tokens/sec. Or to reduce the memory requirements even further, you can try using CPU offloading. For that, you will need to manually edit the `cpu_offload=False` parameter in the file and set it to `True`. @@ -51,5 +51,5 @@ Now we can run it on just 2 devices. python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 2 ``` -taking 13 GB of memory but running at 0.12 tokens/sec on 2 A100 40GB GPUs. +taking ~5 GB of memory but running at 0.23 tokens/sec on 2 A100 40GB GPUs. Smaller devices like 3090s (24 GB) can also fit it with this technique. 
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index f21d16b87c..77af9fc75f 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -8,10 +8,10 @@ Below is a table of all datasets that are currently supported in Lit-GPT: | Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [ URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | | Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) | | Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) | -| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | +| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | | LIMA | Finetuning | 1,084 samples | [URL](https://huggingface.co/datasets/GAIR/lima) | [URL](https://arxiv.org/abs/2305.11206) | "If the source data of LIMA has a stricter license than CC BY-NC-SA, the LIMA dataset follows the same. Otherwise, it follows the CC BY-NC-SA license", [URL](https://huggingface.co/datasets/GAIR/lima#license) | | OpenWeb Text | Pretraining | 8,013,769 documents | [URL](https://github.com/jcpeterson/openwebtext) | [URL](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Unspecified | -| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | | | +| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) |   @@ -139,7 +139,7 @@ The more detailed dataset composition is as follows based on a table taken from | | BEA-GEC | 1,203 | | | Enron | 372 | | **Total** | | 27,739 | -| | | | +| | | | | **Train** | | 23,652 | | **Validation** | | 2,042 | | **Test** | | 2,045 | diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index b8b18913ea..193684f6b8 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -39,35 +39,35 @@ Note that the number of tokens in the training set does not affect the supported The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. 
-| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -| ----- | ------------- | ------------ | --------------- | -------------------- | ----------- | ------------------ | ---------------------------- | -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | -| | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | 80.91 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | 88.36 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | 93.39 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | 82.44 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | 84.79 min | +| | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | 62.23 min | | 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | 91.22 min | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | 93.58 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | 66.68 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | 74.89 min | -| | | | | | | | | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | -| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | -| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | -| | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | -| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | -| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | - | -| | | | | | | | | -| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | -| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - | +| | | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | 118.03 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | 183.88 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | 191.66 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | 125.97 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | 159.43 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | 319.03 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | 327.32 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | 
- | +| | | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | - |   @@ -75,35 +75,35 @@ The following experiments were conducated on 1xA100 with a minibatch size of 128 The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. -| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | -| ----- | ------------- | ------------ | --------------- | -------------------- | -------- | ----------- | ------------------ | ---------------------------- | -| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | -| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | -| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | -| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | -| | | | | | | | | | +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | +|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | 190.47 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | 181.31 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | 181.76 min | +| | | | | | | | | | | 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | 145.96 min | | 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | 153.10 min | | 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | 192.99 min | -| | | | | | | | - | - | -| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | -| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | -| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | -| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | -| | | | | | | | | | -| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | -| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | - | +| | | | | | | | - | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | 296.62 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | 324.03 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | 432.82 min | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | - | +| | | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | 512.5 min | +| 40 B | Falcon | 
None | 1 | 12,042,240 | 4 x A100 | OOM | - | - |   ## Single-GPU Inference -| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | -|-------|----------------|--------------|----------|--------------------------------------------|-----------| +| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | +|-------|----------------|--------------|----------|-------------------------------------------|-----------| | 1.3 B | phi-1.5 | None | 1 x A100 | 2.86 GB | 42.56 | | 1.3 B | phi-1.5 | bnb.nf4 | 1 x A100 | 1.39 GB | 22.89 | | 1.3 B | phi-1.5 | bnb.nf4-dq | 1 x A100 | 1.33 GB | 22.75 | @@ -113,28 +113,28 @@ The following experiments were conducated on multiple A100 GPUs with a minibatch | 3 B | StableLM Alpha | bnb.nf4 | 1 x A100 | 3.20 GB | 29.04 | | 3 B | StableLM Alpha | bnb.nf4-dq | 1 x A100 | 3.04 GB | 27.15 | | 3 B | StableLM Alpha | gptq.int4 | 1 x A100 | 2.43 GB | 5.9 | -| | | | | | | -| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | -| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | -| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | -| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | -| | | | | | | -| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | -| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | -| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | -| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | -| | | | | | | -| 34 B | CodeLlama | None | 1 x A100 | OOM | - | -| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | -| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | -| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 40 B | Falcon | None | 1 x A100 | OOM | - | -| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | -| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | -| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | -| | | | | | | -| 70 B | Llama 2 | None | 1 x A100 | OOM | - | -| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | -| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | -| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | +| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | +| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | +| 7 B | Llama 2 | gptq.int4 | 1 x A100 | 3.93 GB | 5.04 | +| | | | | | | +| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | +| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | +| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | +| 13 B | Llama 2 | gptq.int4 | 1 x A100 | 7.14 GB | 4.17 | +| | | | | | | +| 34 B | CodeLlama | None | 1 x A100 | OOM | - | +| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | +| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | +| 34 B | CodeLlama | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 40 B | Falcon | None | 1 x A100 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | +| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | +| 40 B | Falcon | gptq.int4 | 1 x A100 | OOM (quantize script) | - | +| | | | | | | +| 70 B | Llama 2 | None | 1 x A100 | OOM | - | +| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | +| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | +| 70 B | Llama 2 | gptq.int4 | 1 x A100 | OOM (quantize script) | - | From 015f8683ba472d3e7fbc57f932ccf14a7f97fcfa 
Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Thu, 23 Nov 2023 17:03:22 +0300 Subject: [PATCH 13/14] Undo weirdly appeared typo. --- tutorials/resource-tables.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md index 193684f6b8..67e9b37039 100644 --- a/tutorials/resource-tables.md +++ b/tutorials/resource-tables.md @@ -37,7 +37,7 @@ Note that the number of tokens in the training set does not affect the supported ## Finetuning with LoRA on 1 GPU -The following experiments were conducated on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | |-------|----------------|--------------|-----------------|----------------------|-------------|--------------------|------------------------------| @@ -73,7 +73,7 @@ The following experiments were conducated on 1xA100 with a minibatch size of 128 ## Finetuning with LoRA on Multiple GPUs -The following experiments were conducated on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. +The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. | Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | Time 50k iter (extrapolated) | |-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------|------------------------------| From 1ef81f33b283505f1c18e6e30599148a563add8e Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Mon, 6 May 2024 16:26:35 +0300 Subject: [PATCH 14/14] Add FSDP test with empty_init=True --- tests/test_lora.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_lora.py b/tests/test_lora.py index 431c53cfca..1ca2843a1f 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -733,3 +733,28 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa logs = stdout.getvalue() assert "of trainable parameters: 512" in logs assert "of non-trainable parameters: 1,888" in logs + + +@RunIf(standalone=True, min_cuda_gpus=2) +def test_lora_model_fsdp_init(): + config = Config( + n_layer=1, + n_head=2, + n_embd=8, + block_size=8, + vocab_size=8, + lora_r=8, + lora_alpha=8, + lora_dropout=0.1, + lora_query=True, + lora_value=False, + lora_projection=True, + ) + fabric = Fabric(devices=2, strategy="fsdp", precision="16-true") + fabric.launch() + with fabric.init_module(empty_init=True): + model = LoRAGPT(config) + x = torch.randint(0, config.padded_vocab_size, size=(2, config.block_size), dtype=torch.int64, device=fabric.device) + model = fabric.setup(model) + y = model(x) + assert y.shape == torch.Size([2, 8, 512])
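
For reference, below is a minimal, self-contained sketch of the zero-padding technique the series converges on: scatter the LoRA update for the enabled q/k/v groups into the full fused-QKV output dimension with `index_copy_`, using index positions that are built lazily and cached as a non-persistent buffer. This is not taken from the patches; the class name `ZeroPadDemo`, the sizes (q=128, k=v=64), and the `__main__` check are invented for illustration — the real module wires the same idea into `LoRAQKVLinear` via `self.linear.in_features`/`out_features` and `self.kv_embd_size`, as shown in the hunks above.

```python
import torch
import torch.nn as nn


class ZeroPadDemo(nn.Module):
    """Toy stand-in for the fused-QKV LoRA layer: queries and values get updates, keys do not."""

    def __init__(self, q_size: int = 128, kv_size: int = 64,
                 enable_lora: tuple = (True, False, True)) -> None:
        super().__init__()
        self.q_size, self.kv_size = q_size, kv_size
        self.out_features = q_size + 2 * kv_size          # fused q + k + v output dim
        self.enable_lora = enable_lora

    @property
    def lora_ind(self) -> torch.Tensor:
        # Build the target column indices once and cache them as a non-persistent
        # buffer so they follow the module across devices; the lazy creation is
        # what keeps this compatible with meta-device / FSDP initialization.
        if not hasattr(self, "_lora_ind"):
            enable_q, enable_k, enable_v = self.enable_lora
            ind = []
            if enable_q:
                ind.append(torch.arange(0, self.q_size))
            if enable_k:
                ind.append(torch.arange(self.q_size, self.q_size + self.kv_size))
            if enable_v:
                ind.append(torch.arange(self.q_size + self.kv_size, self.out_features))
            self.register_buffer("_lora_ind", torch.cat(ind), persistent=False)
        return self._lora_ind

    def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
        # x: (..., sum of enabled group sizes) -> (..., out_features); the columns
        # of disabled groups (here: keys) are left as zeros.
        result = x.new_zeros(*x.shape[:-1], self.out_features)
        return result.index_copy_(dim=-1, index=self.lora_ind, source=x)


if __name__ == "__main__":
    demo = ZeroPadDemo()
    x = torch.randn(2, 4, 128 + 64)       # LoRA updates for queries and values only
    padded = demo.zero_pad(x)
    assert padded.shape == (2, 4, 256)
    assert torch.all(padded[..., 128:192] == 0)   # key block stays zero
```

Because `zero_pad` now pads only the last dimension, two-dimensional callers transpose around it rather than inside it — which is why the `merge` hunk in patch 02 becomes `self.zero_pad(delta_w.T * self.scaling).T`.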
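
A short standalone note on the inference-tensor detour in patches 04–09 (plain PyTorch, no Fabric; the variable names are made up): a tensor created under `torch.inference_mode()` cannot be saved for backward later, so the buffer was first cloned on access, and the series ultimately switched `validate` to `torch.no_grad()`, which produces ordinary tensors and needs no workaround.

```python
import torch

with torch.inference_mode():
    ind = torch.arange(3)
# `ind` is an inference tensor: if it were used as the index of `index_copy_`
# in a training forward pass, autograd would refuse to save it for backward.
print(ind.is_inference())           # True
print(ind.clone().is_inference())   # False: cloning outside inference mode is the workaround

with torch.no_grad():
    ind2 = torch.arange(3)
print(ind2.is_inference())          # False: created under no_grad, nothing to work around
```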