From 8a9b0ba4987a3e4d9e673f0d7e55c51cb9242fee Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 00:50:48 -0500
Subject: [PATCH 1/5] Add Mixtral-8x22B

---
 README.md                            |  2 ++
 litgpt/config.py                     | 20 ++++++++++++++++++++
 tests/test_convert_lit_checkpoint.py |  5 +++--
 tests/test_model.py                  |  5 +++--
 tutorials/download_model_weights.md  |  3 +++
 5 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 87389e10fb..42060008e5 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
@@ -128,6 +129,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-8x22b/) |
 | OLMo | 1B, 7B | Allen Institute for AI (AI2) | [Groeneveld et al. 2024](https://aclanthology.org/2024.acl-long.841/) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
diff --git a/litgpt/config.py b/litgpt/config.py
index 5884433372..32e260a56f 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1520,6 +1520,26 @@ def norm_class(self) -> Type:
         n_expert=8,
         n_expert_per_token=2,
     ),
+    # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json
+    dict(
+        name="Mixtral-8x22B-{}v0.1",
+        hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
+        padded_vocab_size=32000,
+        block_size=65536,
+        n_layer=56,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        norm_eps=1e-05,
+        mlp_class_name="LLaMAMoE",
+        intermediate_size=16384,
+        n_head=48,
+        rope_base=1000000,
+        n_expert=8,
+        n_expert_per_token=2,
+    ),
 ]
 for c in mistral:
     for kind in ("", "Instruct-"):
diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py
index 9f27b80d21..f2fe295e62 100644
--- a/tests/test_convert_lit_checkpoint.py
+++ b/tests/test_convert_lit_checkpoint.py
@@ -156,9 +156,10 @@ def test_against_hf_llama2(ours_kwargs):
 
 
 @torch.inference_mode()
-def test_against_mixtral():
+@pytest.mark.parametrize("model_name", ("Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x22B-Instruct-v0.1"))
+def test_against_mixtral(model_name):
     ours_config = Config.from_name(
-        "Mixtral-8x7B-Instruct-v0.1",
+        model_name,
         padded_vocab_size=10000,
         n_layer=2,
         n_embd=32,
diff --git a/tests/test_model.py b/tests/test_model.py
index 1a9a94efd5..644f751162 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -512,11 +512,12 @@ def test_against_mathstral_hf_models(device, dtype):
 
 
 @torch.inference_mode()
-def test_against_hf_mixtral():
+@pytest.mark.parametrize("model_name", ("Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x22B-Instruct-v0.1"))
+def test_against_hf_mixtral(model_name):
     device = torch.device("cpu")
     dtype = torch.float32
     ours_config = Config.from_name(
-        "Mixtral-8x7B-Instruct-v0.1",
+        model_name,
         padded_vocab_size=10000,
         n_layer=2,
         n_embd=32,
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 50c6924f63..9fabeb497a 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -26,6 +26,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-8x22b/) |
 | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
 | OLMo | 1B, 7B | Allen Institute for AI (AI2) | [Groeneveld et al. 2024](https://aclanthology.org/2024.acl-long.841/) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
@@ -156,6 +157,8 @@ mistralai/Mistral-7B-v0.3
 mistralai/Mistral-Large-Instruct-2407
 mistralai/Mixtral-8x7B-Instruct-v0.1
 mistralai/Mixtral-8x7B-v0.1
+mistralai/Mixtral-8x22B-Instruct-v0.1
+mistralai/Mixtral-8x22B-v0.1
 NousResearch/Nous-Hermes-13b
 NousResearch/Nous-Hermes-llama-2-7b
 NousResearch/Nous-Hermes-Llama2-13b

From 9dc5d4523594f0dffa907031f0daef1a8419f572 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:00:27 -0500
Subject: [PATCH 2/5] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 42060008e5..90a813b024 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,6 @@ Every model is written from scratch to maximize performance and remove layers of
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |

From b94bf90cc0ec0ea026a1b357f1249e883f6107d9 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:01:43 -0500
Subject: [PATCH 3/5] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 90a813b024..f2eed9c644 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ Every model is written from scratch to maximize performance and remove layers of
 |----|----|----|----|
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
-| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
+| Mixtral MoE | 8x7B, 8x22B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |

From 319f462dcfc7fa346574cfff452a6df9c9d76c29 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:25:07 -0500
Subject: [PATCH 4/5] Mixtral-8x22B fix: added n_embd

---
 litgpt/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litgpt/config.py b/litgpt/config.py
index 32e260a56f..c71233a7b8 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1536,6 +1536,7 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMoE",
         intermediate_size=16384,
         n_head=48,
+        n_embd=6144,
         rope_base=1000000,
         n_expert=8,
         n_expert_per_token=2,

From 191aa3441e562fff3e10fe1065df0ffafcc656c3 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 09:39:04 -0500
Subject: [PATCH 5/5] Mixtral-8x22B minor vocab_size fix

---
 litgpt/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index c71233a7b8..9e1e10216d 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1524,7 +1524,7 @@ def norm_class(self) -> Type:
     dict(
         name="Mixtral-8x22B-{}v0.1",
         hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
-        padded_vocab_size=32000,
+        padded_vocab_size=32768,
         block_size=65536,
         n_layer=56,
         n_query_groups=8,
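Note (not part of the patch series): the new dict is registered through the existing expansion loop, where the "{}" placeholder in "Mixtral-8x22B-{}v0.1" is filled by `for kind in ("", "Instruct-")`, so both a base and an Instruct variant become available. A minimal sketch of how the registered configs can be sanity-checked once the full series is applied (including the n_embd and padded_vocab_size fixes in PATCH 4/5 and 5/5); the assertion values are copied from the added dict, and this script is an illustration rather than part of the change:

    # Sketch: check the Mixtral-8x22B configs registered by this series.
    # Assumes the patches above are applied; values mirror litgpt/config.py.
    from litgpt.config import Config

    # The "{}" placeholder expands to the base and Instruct variants.
    for name in ("Mixtral-8x22B-v0.1", "Mixtral-8x22B-Instruct-v0.1"):
        cfg = Config.from_name(name)
        assert cfg.n_layer == 56
        assert cfg.n_embd == 6144            # added in PATCH 4/5
        assert cfg.padded_vocab_size == 32768  # fixed in PATCH 5/5
        assert cfg.n_expert == 8 and cfg.n_expert_per_token == 2
        print(name, "OK")

The corresponding checkpoints are downloaded under the repo IDs added to tutorials/download_model_weights.md (mistralai/Mixtral-8x22B-v0.1 and mistralai/Mixtral-8x22B-Instruct-v0.1).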