Merge pull request #750 from allenai/dave/annealing_peteish_v2
Legal Whammy for 7B
dirkgr authored Nov 26, 2024
2 parents 767047c + ba286ad commit 1048c16
Showing 43 changed files with 39,032 additions and 0 deletions.

@@ -0,0 +1,298 @@
run_name: peteish7-weka-microanneal-from928646_automathtext
seed: 7201
dry_run: false

wandb:
name: ${run_name}
project: olmo-medium
group: ${run_name}
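
# Annotation (not in the original config): ${run_name} is an OmegaConf-style
# interpolation, so the W&B run name and group both resolve to the single
# run_name value defined at the top of this file.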

model:
d_model: 4096
n_heads: 32
n_layers: 32
mlp_hidden_size: 22016
weight_tying: false
alibi: false
rope: true
rope_theta: 500000
flash_attention: true
attention_dropout: 0.0
include_bias: false
block_type: sequential
layer_norm_type: rms
layer_norm_with_affine: true
layer_norm_eps: 1e-6
bias_for_layer_norm: false
attention_layer_norm: true
attention_layer_norm_with_affine: true
norm_after: true
activation_type: swiglu
residual_dropout: 0.0
embedding_dropout: 0.0
max_sequence_length: 4096
vocab_size: 100278
embedding_size: 100352
eos_token_id: 100257
pad_token_id: 100277
init_device: meta
init_fn: normal
init_std: 0.02
init_cutoff_factor: 3
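
# Annotation (not in the original config): derived shapes, for reference.
# With d_model = 4096 and n_heads = 32, each attention head has dimension
# 4096 / 32 = 128. mlp_hidden_size = 22016 is 2 * 11008, consistent with a
# fused SwiGLU gate+up projection of intermediate size 11008 (the assumption
# here is that this field counts both halves of the fused projection).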

softmax_auxiliary_loss: true
auxiliary_loss_multiplier: 1e-5
fused_loss: true

compile: null

optimizer:
name: adamw
learning_rate: 0.000061499
weight_decay: 0.1
eps: 1e-8
decay_norm_and_bias: true
decay_embeddings: false
betas:
- 0.9
- 0.95
metrics_log_interval: 1

scheduler:
name: linear_with_warmup
t_warmup: 0
alpha_f: 0
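
# Annotation (not in the original config): with t_warmup = 0 and alpha_f = 0,
# linear_with_warmup reduces to a pure linear decay from the peak LR to zero
# over max_duration, i.e. lr(t) ~= 0.000061499 * (1 - t/T); halfway through
# the anneal the LR would be about 3.07e-5. The peak value is presumably the
# LR the base run had reached at the load_path checkpoint below, though that
# is an inference, not something stated in this file.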

tokenizer:
identifier: tokenizers/allenai_dolma2.json
truncate_direction: right

save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name}
save_overwrite: false

save_interval: 1000
save_interval_ephemeral: 250
save_num_checkpoints_to_keep: -1
sharded_checkpointer: olmo_core

save_interval_unsharded: null
save_num_unsharded_checkpoints_to_keep: -1
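
# Annotation (not in the original config): -1 is the "keep everything"
# convention, so no checkpoints are pruned; ephemeral saves every 250 steps
# cover fast restarts between the permanent saves every 1000 steps.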

load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646

restore_dataloader: false
no_pre_train_checkpoint: true
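
# Annotation (not in the original config): the anneal restores model (and,
# by default, optimizer) state from the base-run checkpoint at step 928646,
# while restore_dataloader = false restarts data loading from scratch on the
# new annealing mixture defined under `data` below.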

max_duration: 1ep
# stop_at: 11931 # Relying on max_duration for anneals
global_train_batch_size: 1024
device_train_microbatch_size: 2
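
# Annotation (not in the original config): at max_sequence_length = 4096,
# each optimizer step consumes 1024 * 4096 = 4,194,304 tokens. The per-device
# gradient accumulation factor is 1024 / (2 * world_size), using `world_size`
# here as a stand-in for the total GPU count; e.g. on 128 GPUs that is
# 1024 / 256 = 4 microbatches per step.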

precision: amp_bf16

fsdp:
wrapping_strategy: by_block_and_size
precision: mixed

activation_checkpointing: one_in_four
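
# Annotation (not in the original config): one_in_four checkpoints the
# activations of every fourth transformer block, trading some recompute in
# the backward pass for lower peak activation memory.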

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
window_size: 1

gen1_gc_interval: 1

eval_interval: 1000
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
# - label: all-small-ppl-validation
# data:
# num_workers: 0
# drop_last: true
# # generate_doc_lengths: true
# memmap_dtype: uint32
# datasets:
# c4_en-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
# dolma_books-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
# dolma_common-crawl-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
# dolma_pes2o-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
# dolma_reddit-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
# dolma_stack-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
# dolma_wiki-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
# ice-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
# m2d2_s2orc-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
# pile-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
# wikitext_103-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy

##########################
# Downstream evaluations #
##########################
- label: piqa
type: downstream

- label: hellaswag
type: downstream

- label: winogrande
type: downstream

- label: openbook_qa
type: downstream

- label: boolq
type: downstream

- label: sciq
type: downstream

- label: arc_easy
type: downstream

- label: arc_challenge
type: downstream

- label: copa
type: downstream

#- label: rte
# type: downstream

#- label: commitment_bank
# type: downstream

#- label: sst2
# type: downstream

- label: commonsense_qa
type: downstream

- label: social_iqa
type: downstream

- label: mmlu_stem_var
type: downstream

- label: mmlu_humanities_var
type: downstream

- label: mmlu_social_sciences_var
type: downstream

- label: mmlu_other_var
type: downstream

- label: mmlu_stem_mc_5shot
type: downstream

- label: mmlu_humanities_mc_5shot
type: downstream

- label: mmlu_social_sciences_mc_5shot
type: downstream

- label: mmlu_other_mc_5shot
type: downstream

- label: mmlu_stem_mc_5shot_test
type: downstream

- label: mmlu_humanities_mc_5shot_test
type: downstream

- label: mmlu_social_sciences_mc_5shot_test
type: downstream

- label: mmlu_other_mc_5shot_test
type: downstream

- label: basic_arithmetic
type: downstream

- label: trivia_qa_wiki_ppl
type: downstream

- label: natural_qs_open_ppl
type: downstream

- label: arc_easy_ppl
type: downstream
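
# Annotation (not in the original config): by the usual reading of these
# evaluator labels, *_var tasks use varied prompt formats, *_mc_5shot are
# 5-shot multiple-choice, and *_ppl tasks score by perplexity of the gold
# continuation; this is inferred from the names rather than stated here.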

data:
pad_direction: right
# generate_doc_lengths: true
num_workers: 32
drop_last: true
pin_memory: true
prefetch_factor: 8
persistent_workers: true
memmap_dtype: uint32
timeout: 0
instance_filter:
repetition_max_period: 13
repetition_min_period: 1
repetition_max_count: 32
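
# Annotation (not in the original config): this filter is understood to drop
# training instances containing a token pattern with period between 1 and 13
# repeated more than 32 times, i.e. degenerate repetition loops.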
paths:
#SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/ (5.23BT)
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-38-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-02-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-64-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-48-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-76-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-11-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-35-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-47-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-83-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-04-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-15-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-46-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-22-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-61-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-44-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-43-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-63-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-74-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-45-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-53-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-14-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-80-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-54-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-32-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-75-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-72-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-86-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-30-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-23-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-03-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-62-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-26-00000.npy
#SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (6.07BT)
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy