From 8a9b0ba4987a3e4d9e673f0d7e55c51cb9242fee Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 00:50:48 -0500
Subject: [PATCH 1/5] Add Mixtral-8x22B

---
 README.md                            |  2 ++
 litgpt/config.py                     | 20 ++++++++++++++++++++
 tests/test_convert_lit_checkpoint.py |  5 +++--
 tests/test_model.py                  |  5 +++--
 tutorials/download_model_weights.md  |  3 +++
 5 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 87389e10fb..42060008e5 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
@@ -128,6 +129,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-8x22b/) |
 | OLMo | 1B, 7B | Allen Institute for AI (AI2) | [Groeneveld et al. 2024](https://aclanthology.org/2024.acl-long.841/) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
diff --git a/litgpt/config.py b/litgpt/config.py
index 5884433372..32e260a56f 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1520,6 +1520,26 @@ def norm_class(self) -> Type:
         n_expert=8,
         n_expert_per_token=2,
     ),
+    # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json
+    dict(
+        name="Mixtral-8x22B-{}v0.1",
+        hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
+        padded_vocab_size=32000,
+        block_size=65536,
+        n_layer=56,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        norm_eps=1e-05,
+        mlp_class_name="LLaMAMoE",
+        intermediate_size=16384,
+        n_head=48,
+        rope_base=1000000,
+        n_expert=8,
+        n_expert_per_token=2,
+    ),
 ]
 for c in mistral:
     for kind in ("", "Instruct-"):
diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py
index 9f27b80d21..f2fe295e62 100644
--- a/tests/test_convert_lit_checkpoint.py
+++ b/tests/test_convert_lit_checkpoint.py
@@ -156,9 +156,10 @@ def test_against_hf_llama2(ours_kwargs):
 
 
 @torch.inference_mode()
-def test_against_mixtral():
+@pytest.mark.parametrize("model_name", ("Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x22B-Instruct-v0.1"))
+def test_against_mixtral(model_name):
     ours_config = Config.from_name(
-        "Mixtral-8x7B-Instruct-v0.1",
+        model_name,
         padded_vocab_size=10000,
         n_layer=2,
         n_embd=32,
diff --git a/tests/test_model.py b/tests/test_model.py
index 1a9a94efd5..644f751162 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -512,11 +512,12 @@ def test_against_mathstral_hf_models(device, dtype):
 
 
 @torch.inference_mode()
-def test_against_hf_mixtral():
+@pytest.mark.parametrize("model_name", ("Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x22B-Instruct-v0.1"))
+def test_against_hf_mixtral(model_name):
     device = torch.device("cpu")
     dtype = torch.float32
     ours_config = Config.from_name(
-        "Mixtral-8x7B-Instruct-v0.1",
+        model_name,
         padded_vocab_size=10000,
         n_layer=2,
         n_embd=32,
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 50c6924f63..9fabeb497a 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -26,6 +26,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-8x22b/) |
 | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
 | OLMo | 1B, 7B | Allen Institute for AI (AI2) | [Groeneveld et al. 2024](https://aclanthology.org/2024.acl-long.841/) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
@@ -156,6 +157,8 @@ mistralai/Mistral-7B-v0.3
 mistralai/Mistral-Large-Instruct-2407
 mistralai/Mixtral-8x7B-Instruct-v0.1
 mistralai/Mixtral-8x7B-v0.1
+mistralai/Mixtral-8x22B-Instruct-v0.1
+mistralai/Mixtral-8x22B-v0.1
 NousResearch/Nous-Hermes-13b
 NousResearch/Nous-Hermes-llama-2-7b
 NousResearch/Nous-Hermes-Llama2-13b

From 9dc5d4523594f0dffa907031f0daef1a8419f572 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:00:27 -0500
Subject: [PATCH 2/5] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 42060008e5..90a813b024 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,6 @@ Every model is written from scratch to maximize performance and remove layers of
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mixtral MoE | 8x22B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |

From b94bf90cc0ec0ea026a1b357f1249e883f6107d9 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:01:43 -0500
Subject: [PATCH 3/5] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 90a813b024..f2eed9c644 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ Every model is written from scratch to maximize performance and remove layers of
 |----|----|----|----|
 | Llama 3, 3.1, 3.2 | 1B, 3B, 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
-| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
+| Mixtral MoE | 8x7B, 8x22B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
 | Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |

From 319f462dcfc7fa346574cfff452a6df9c9d76c29 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 01:25:07 -0500
Subject: [PATCH 4/5] Mixtral-8x22B fix: added n_embd

---
 litgpt/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litgpt/config.py b/litgpt/config.py
index 32e260a56f..c71233a7b8 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1536,6 +1536,7 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMoE",
         intermediate_size=16384,
         n_head=48,
+        n_embd=6144,
         rope_base=1000000,
         n_expert=8,
         n_expert_per_token=2,

From 191aa3441e562fff3e10fe1065df0ffafcc656c3 Mon Sep 17 00:00:00 2001
From: Yu Shi Jie
Date: Sat, 30 Nov 2024 09:39:04 -0500
Subject: [PATCH 5/5] Mixtral-8x22B minor vocab_size fix

---
 litgpt/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index c71233a7b8..9e1e10216d 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1524,7 +1524,7 @@ def norm_class(self) -> Type:
     dict(
         name="Mixtral-8x22B-{}v0.1",
         hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
-        padded_vocab_size=32000,
+        padded_vocab_size=32768,
         block_size=65536,
         n_layer=56,
         n_query_groups=8,
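Note (not part of the patch series): the new dict is registered through the existing expansion loop, where the "{}" placeholder in "Mixtral-8x22B-{}v0.1" is filled by `for kind in ("", "Instruct-")`, so both a base and an Instruct variant become available. A minimal sketch of how the registered configs can be sanity-checked once the full series is applied (including the n_embd and padded_vocab_size fixes in PATCH 4/5 and 5/5); the assertion values are copied from the added dict, and this script is an illustration rather than part of the change:

    # Sketch: check the Mixtral-8x22B configs registered by this series.
    # Assumes the patches above are applied; values mirror litgpt/config.py.
    from litgpt.config import Config

    # The "{}" placeholder expands to the base and Instruct variants.
    for name in ("Mixtral-8x22B-v0.1", "Mixtral-8x22B-Instruct-v0.1"):
        cfg = Config.from_name(name)
        assert cfg.n_layer == 56
        assert cfg.n_embd == 6144            # added in PATCH 4/5
        assert cfg.padded_vocab_size == 32768  # fixed in PATCH 5/5
        assert cfg.n_expert == 8 and cfg.n_expert_per_token == 2
        print(name, "OK")

The corresponding checkpoints are downloaded under the repo IDs added to tutorials/download_model_weights.md (mistralai/Mixtral-8x22B-v0.1 and mistralai/Mixtral-8x22B-Instruct-v0.1).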