From e4d3010ab95476fb7285ef6ab2f490f5c2636557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Celiebak=E2=80=9D?= Date: Mon, 27 May 2024 10:39:04 +0000 Subject: [PATCH 1/5] Add 1-sqrt function for the cooldown phase. --- src/nanotron/config/config.py | 6 +++--- src/nanotron/helpers.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index d9946f26..706ad35f 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -231,7 +231,7 @@ class LRSchedulerArgs: lr_warmup_steps: number of steps to warmup the learning rate lr_warmup_style: linear or constant - lr_decay_style: linear or cosine + lr_decay_style: linear,cosine or 1-sqrt min_decay_lr: minimum learning rate after decay lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps @@ -254,9 +254,9 @@ def __post_init__(self): self.lr_warmup_style = "linear" if self.lr_decay_style is None: self.lr_decay_style = "linear" - if self.lr_decay_style not in ["linear", "cosine"]: + if self.lr_decay_style not in ["linear", "cosine", "1-sqrt"]: raise ValueError( - f"lr_decay_style should be a string selected in ['linear', 'cosine'] and not {self.lr_decay_style}" + f"lr_decay_style should be a string selected in ['linear', 'cosine', '1-sqrt'] and not {self.lr_decay_style}" ) if self.min_decay_lr is None: self.min_decay_lr = self.learning_rate diff --git a/src/nanotron/helpers.py b/src/nanotron/helpers.py index f7bf63e5..a82f0294 100644 --- a/src/nanotron/helpers.py +++ b/src/nanotron/helpers.py @@ -146,6 +146,12 @@ def lr_lambda(current_step: int, initial_lr: float): * (lr_decay_steps - (current_step - lr_decay_starting_step)) / lr_decay_steps ) + elif lr_scheduler_args.lr_decay_style == "1-sqrt": + lmbda = ( + lr_scheduler_args.min_decay_lr + + (initial_lr - lr_scheduler_args.min_decay_lr) + * (1 - math.sqrt((current_step - lr_decay_starting_step) / lr_decay_steps)) + ) else: raise ValueError(f"Unknown decay style {lr_scheduler_args.lr_decay_style}") From 180faf42d80514b636fe293d1a6623e09857f5dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Celiebak=E2=80=9D?= Date: Mon, 27 May 2024 10:43:00 +0000 Subject: [PATCH 2/5] fix typo --- src/nanotron/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 706ad35f..619c776f 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -231,7 +231,7 @@ class LRSchedulerArgs: lr_warmup_steps: number of steps to warmup the learning rate lr_warmup_style: linear or constant - lr_decay_style: linear,cosine or 1-sqrt + lr_decay_style: linear, cosine or 1-sqrt min_decay_lr: minimum learning rate after decay lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps From 97c9780a8d0e5b1b41659770abed9c9845c490dd Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Wed, 29 May 2024 15:51:51 -0400 Subject: [PATCH 3/5] add rope_theta to hf conversion script --- examples/llama/convert_weights.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama/convert_weights.py b/examples/llama/convert_weights.py index 3e5f830c..7663399a 100644 --- a/examples/llama/convert_weights.py +++ b/examples/llama/convert_weights.py @@ -71,6 +71,7 @@ def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]: "pretraining_tp": "pretraining_tp", "rms_norm_eps": "rms_norm_eps", "rope_scaling": "rope_scaling", + "rope_theta": "rope_theta", "tie_word_embeddings": "tie_word_embeddings", "use_cache": "use_cache", "vocab_size": "vocab_size", From 1753921010474ce6ceae96aa8bf453af385f2393 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Jun 2024 10:25:41 +0200 Subject: [PATCH 4/5] feat(ci): add trufflehog secrets detection --- .github/workflows/trufflehog.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/trufflehog.yml diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml new file mode 100644 index 00000000..ba6fdda9 --- /dev/null +++ b/.github/workflows/trufflehog.yml @@ -0,0 +1,21 @@ +on: + push: + +name: Secret Leaks + +permissions: + contents: read + id-token: write + issues: write + pull-requests: write + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main From 1db85f3942faa4d78f7371d387631323f3ed5a74 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Jun 2024 10:46:36 +0200 Subject: [PATCH 5/5] fix(ci): remove unnecessary permissions --- .github/workflows/trufflehog.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index ba6fdda9..9cbbf680 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -3,12 +3,6 @@ on: name: Secret Leaks -permissions: - contents: read - id-token: write - issues: write - pull-requests: write - jobs: trufflehog: runs-on: ubuntu-latest