diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 2f23c0f8d6..59a3ce3075 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -16,7 +16,6 @@ defaults: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - UV_HTTP_TIMEOUT: 500 jobs: cpu-tests: diff --git a/README.md b/README.md index 8e14f547aa..692525df48 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ Use, Finetune, pretrain, deploy over 20+ LLMs ([full list](tutorials/download_mo | Model | Model size | Author | Reference | |----|----|----|----| +| CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) | | Dolly | 3B, 7B, 12B | Databricks | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | | Falcon | 7B, 40B, 180B | TII UAE | [TII 2023](https://falconllm.tii.ae) | diff --git a/litgpt/config.py b/litgpt/config.py index caad1454b9..0a4234222d 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -888,6 +888,32 @@ def norm_class(self) -> Type: copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it" configs.append(copy) +################## +# Google CodeGemma +################## +codegemma = [ + # https://huggingface.co/google/codegemma-7b-it/blob/main/config.json + dict( + name="CodeGemma-7b-it", + hf_config=dict(org="google", name="codegemma-7b-it"), + scale_embeddings=True, + vocab_size=256000, + padding_multiple=64, + n_embd=3072, + n_layer=28, + n_head=16, + head_size=256, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + norm_class_name="RMSNorm", + mlp_class_name="GemmaMLP", + gelu_approximate="tanh", + intermediate_size=24576, + ), +] +configs.extend(codegemma) + ########################## # Stability AI FreeWilly2 diff --git a/litgpt/prompts.py b/litgpt/prompts.py index d1266c731b..2d989be32b 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -330,7 +330,7 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Phi2() if re.search(r"tiny-llama.*chat", model_name): return TinyLlama() - if re.search(r"Gemma.*-it", model_name): + if re.search(r"(Code)?Gemma.*-it", model_name): return Gemma() return Default() diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 55c214a01c..b91afa5929 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -5,6 +5,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Model | Model size | Reference | |----------------------------------------------|------------------------------------------|------------------------------------------------------------------------------------------------------------------------------| +| CodeGemma by Google | 7B | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) | | Code Llama by Meta AI | 7B, 13B, 34B, 70B | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) | | Dolly by Databricks | 3B, 7B, 12B | [Conover et al. 2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | | Falcon by TII UAE | 7B, 40B, 180B | [TII 2023](https://falconllm.tii.ae) | @@ -84,6 +85,7 @@ garage-bAInd/Platypus2-70B garage-bAInd/Platypus2-70B-instruct garage-bAInd/Platypus2-7B garage-bAInd/Stable-Platypus2-13B +google/codegemma-7b-it google/gemma-2b google/gemma-2b-it google/gemma-7b