From e4d3010ab95476fb7285ef6ab2f490f5c2636557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Celiebak=E2=80=9D?= <elie.bakouch@huggingface.co>
Date: Mon, 27 May 2024 10:39:04 +0000
Subject: [PATCH 1/5] Add 1-sqrt function for the cooldown phase.

---
 src/nanotron/config/config.py | 6 +++---
 src/nanotron/helpers.py       | 6 ++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index d9946f26..706ad35f 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -231,7 +231,7 @@ class LRSchedulerArgs:
 
     lr_warmup_steps: number of steps to warmup the learning rate
     lr_warmup_style: linear or constant
-    lr_decay_style: linear or cosine
+    lr_decay_style: linear,cosine or 1-sqrt
     min_decay_lr: minimum learning rate after decay
     lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps
     lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps
@@ -254,9 +254,9 @@ def __post_init__(self):
             self.lr_warmup_style = "linear"
         if self.lr_decay_style is None:
             self.lr_decay_style = "linear"
-        if self.lr_decay_style not in ["linear", "cosine"]:
+        if self.lr_decay_style not in ["linear", "cosine", "1-sqrt"]:
             raise ValueError(
-                f"lr_decay_style should be a string selected in ['linear', 'cosine'] and not {self.lr_decay_style}"
+                f"lr_decay_style should be a string selected in ['linear', 'cosine', '1-sqrt'] and not {self.lr_decay_style}"
             )
         if self.min_decay_lr is None:
             self.min_decay_lr = self.learning_rate
diff --git a/src/nanotron/helpers.py b/src/nanotron/helpers.py
index f7bf63e5..a82f0294 100644
--- a/src/nanotron/helpers.py
+++ b/src/nanotron/helpers.py
@@ -146,6 +146,12 @@ def lr_lambda(current_step: int, initial_lr: float):
                     * (lr_decay_steps - (current_step - lr_decay_starting_step))
                     / lr_decay_steps
                 )
+            elif lr_scheduler_args.lr_decay_style == "1-sqrt":
+                lmbda = (
+                    lr_scheduler_args.min_decay_lr
+                    + (initial_lr - lr_scheduler_args.min_decay_lr)
+                    * (1 - math.sqrt((current_step - lr_decay_starting_step) / lr_decay_steps))
+                )
             else:
                 raise ValueError(f"Unknown decay style {lr_scheduler_args.lr_decay_style}")
 

From 180faf42d80514b636fe293d1a6623e09857f5dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Celiebak=E2=80=9D?= <elie.bakouch@huggingface.co>
Date: Mon, 27 May 2024 10:43:00 +0000
Subject: [PATCH 2/5] fix typo

---
 src/nanotron/config/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 706ad35f..619c776f 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -231,7 +231,7 @@ class LRSchedulerArgs:
 
     lr_warmup_steps: number of steps to warmup the learning rate
     lr_warmup_style: linear or constant
-    lr_decay_style: linear,cosine or 1-sqrt
+    lr_decay_style: linear, cosine or 1-sqrt
     min_decay_lr: minimum learning rate after decay
     lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps
     lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps

From 97c9780a8d0e5b1b41659770abed9c9845c490dd Mon Sep 17 00:00:00 2001
From: Jeffrey Quesnelle <emozilla@nousresearch.com>
Date: Wed, 29 May 2024 15:51:51 -0400
Subject: [PATCH 3/5] add rope_theta to hf conversion script

---
 examples/llama/convert_weights.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llama/convert_weights.py b/examples/llama/convert_weights.py
index 3e5f830c..7663399a 100644
--- a/examples/llama/convert_weights.py
+++ b/examples/llama/convert_weights.py
@@ -71,6 +71,7 @@ def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
         "pretraining_tp": "pretraining_tp",
         "rms_norm_eps": "rms_norm_eps",
         "rope_scaling": "rope_scaling",
+        "rope_theta": "rope_theta",
         "tie_word_embeddings": "tie_word_embeddings",
         "use_cache": "use_cache",
         "vocab_size": "vocab_size",

From 1753921010474ce6ceae96aa8bf453af385f2393 Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Mon, 10 Jun 2024 10:25:41 +0200
Subject: [PATCH 4/5] feat(ci): add trufflehog secrets detection

---
 .github/workflows/trufflehog.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 .github/workflows/trufflehog.yml

diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
new file mode 100644
index 00000000..ba6fdda9
--- /dev/null
+++ b/.github/workflows/trufflehog.yml
@@ -0,0 +1,21 @@
+on:
+  push:
+
+name: Secret Leaks
+
+permissions:
+  contents: read
+  id-token: write
+  issues: write
+  pull-requests: write
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Secret Scanning
+      uses: trufflesecurity/trufflehog@main

From 1db85f3942faa4d78f7371d387631323f3ed5a74 Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Mon, 10 Jun 2024 10:46:36 +0200
Subject: [PATCH 5/5] fix(ci): remove unnecessary permissions

---
 .github/workflows/trufflehog.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
index ba6fdda9..9cbbf680 100644
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -3,12 +3,6 @@ on:
 
 name: Secret Leaks
 
-permissions:
-  contents: read
-  id-token: write
-  issues: write
-  pull-requests: write
-
 jobs:
   trufflehog:
     runs-on: ubuntu-latest