Merge pull request #750 from allenai/dave/annealing_peteish_v2
Legal Whammy for 7B
dirkgr authored Nov 26, 2024
2 parents 767047c + ba286ad commit 1048c16
Showing 43 changed files with 39,032 additions and 0 deletions.

@@ -0,0 +1,298 @@
run_name: peteish7-weka-microanneal-from928646_automathtext
seed: 7201
dry_run: false

wandb:
name: ${run_name}
project: olmo-medium
group: ${run_name}
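
# Annotation (not in the original config): ${run_name} is an OmegaConf-style
# interpolation, so the W&B run name and group both resolve to the single
# run_name value defined at the top of this file.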

model:
d_model: 4096
n_heads: 32
n_layers: 32
mlp_hidden_size: 22016
weight_tying: false
alibi: false
rope: true
rope_theta: 500000
flash_attention: true
attention_dropout: 0.0
include_bias: false
block_type: sequential
layer_norm_type: rms
layer_norm_with_affine: true
layer_norm_eps: 1e-6
bias_for_layer_norm: false
attention_layer_norm: true
attention_layer_norm_with_affine: true
norm_after: true
activation_type: swiglu
residual_dropout: 0.0
embedding_dropout: 0.0
max_sequence_length: 4096
vocab_size: 100278
embedding_size: 100352
eos_token_id: 100257
pad_token_id: 100277
init_device: meta
init_fn: normal
init_std: 0.02
init_cutoff_factor: 3
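
# Annotation (not in the original config): derived shapes, for reference.
# With d_model = 4096 and n_heads = 32, each attention head has dimension
# 4096 / 32 = 128. mlp_hidden_size = 22016 is 2 * 11008, consistent with a
# fused SwiGLU gate+up projection of intermediate size 11008 (the assumption
# here is that this field counts both halves of the fused projection).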

softmax_auxiliary_loss: true
auxiliary_loss_multiplier: 1e-5
fused_loss: true

compile: null

optimizer:
name: adamw
learning_rate: 0.000061499
weight_decay: 0.1
eps: 1e-8
decay_norm_and_bias: true
decay_embeddings: false
betas:
- 0.9
- 0.95
metrics_log_interval: 1

scheduler:
name: linear_with_warmup
t_warmup: 0
alpha_f: 0
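
# Annotation (not in the original config): with t_warmup = 0 and alpha_f = 0,
# linear_with_warmup reduces to a pure linear decay from the peak LR to zero
# over max_duration, i.e. lr(t) ~= 0.000061499 * (1 - t/T); halfway through
# the anneal the LR would be about 3.07e-5. The peak value is presumably the
# LR the base run had reached at the load_path checkpoint below, though that
# is an inference, not something stated in this file.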

tokenizer:
identifier: tokenizers/allenai_dolma2.json
truncate_direction: right

save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-microanneals/${run_name}
save_overwrite: false

save_interval: 1000
save_interval_ephemeral: 250
save_num_checkpoints_to_keep: -1
sharded_checkpointer: olmo_core

save_interval_unsharded: null
save_num_unsharded_checkpoints_to_keep: -1
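
# Annotation (not in the original config): -1 is the "keep everything"
# convention, so no checkpoints are pruned; ephemeral saves every 250 steps
# cover fast restarts between the permanent saves every 1000 steps.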

load_path: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7/step928646

restore_dataloader: false
no_pre_train_checkpoint: true
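
# Annotation (not in the original config): the anneal restores model (and,
# by default, optimizer) state from the base-run checkpoint at step 928646,
# while restore_dataloader = false restarts data loading from scratch on the
# new annealing mixture defined under `data` below.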

max_duration: 1ep
# stop_at: 11931 # Relying on max_duration for anneals
global_train_batch_size: 1024
device_train_microbatch_size: 2
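
# Annotation (not in the original config): at max_sequence_length = 4096,
# each optimizer step consumes 1024 * 4096 = 4,194,304 tokens. The per-device
# gradient accumulation factor is 1024 / (2 * world_size), using `world_size`
# here as a stand-in for the total GPU count; e.g. on 128 GPUs that is
# 1024 / 256 = 4 microbatches per step.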

precision: amp_bf16

fsdp:
wrapping_strategy: by_block_and_size
precision: mixed

activation_checkpointing: one_in_four
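
# Annotation (not in the original config): one_in_four checkpoints the
# activations of every fourth transformer block, trading some recompute in
# the backward pass for lower peak activation memory.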

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
window_size: 1

gen1_gc_interval: 1

eval_interval: 1000
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
# - label: all-small-ppl-validation
# data:
# num_workers: 0
# drop_last: true
# # generate_doc_lengths: true
# memmap_dtype: uint32
# datasets:
# c4_en-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
# dolma_books-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
# dolma_common-crawl-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
# dolma_pes2o-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
# dolma_reddit-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
# dolma_stack-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
# dolma_wiki-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
# ice-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
# m2d2_s2orc-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
# pile-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
# wikitext_103-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy

##########################
# Downstream evaluations #
##########################
- label: piqa
type: downstream

- label: hellaswag
type: downstream

- label: winogrande
type: downstream

- label: openbook_qa
type: downstream

- label: boolq
type: downstream

- label: sciq
type: downstream

- label: arc_easy
type: downstream

- label: arc_challenge
type: downstream

- label: copa
type: downstream

#- label: rte
# type: downstream

#- label: commitment_bank
# type: downstream

#- label: sst2
# type: downstream

- label: commonsense_qa
type: downstream

- label: social_iqa
type: downstream

- label: mmlu_stem_var
type: downstream

- label: mmlu_humanities_var
type: downstream

- label: mmlu_social_sciences_var
type: downstream

- label: mmlu_other_var
type: downstream

- label: mmlu_stem_mc_5shot
type: downstream

- label: mmlu_humanities_mc_5shot
type: downstream

- label: mmlu_social_sciences_mc_5shot
type: downstream

- label: mmlu_other_mc_5shot
type: downstream

- label: mmlu_stem_mc_5shot_test
type: downstream

- label: mmlu_humanities_mc_5shot_test
type: downstream

- label: mmlu_social_sciences_mc_5shot_test
type: downstream

- label: mmlu_other_mc_5shot_test
type: downstream

- label: basic_arithmetic
type: downstream

- label: trivia_qa_wiki_ppl
type: downstream

- label: natural_qs_open_ppl
type: downstream

- label: arc_easy_ppl
type: downstream
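
# Annotation (not in the original config): by the usual reading of these
# evaluator labels, *_var tasks use varied prompt formats, *_mc_5shot are
# 5-shot multiple-choice, and *_ppl tasks score by perplexity of the gold
# continuation; this is inferred from the names rather than stated here.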

data:
pad_direction: right
# generate_doc_lengths: true
num_workers: 32
drop_last: true
pin_memory: true
prefetch_factor: 8
persistent_workers: true
memmap_dtype: uint32
timeout: 0
instance_filter:
repetition_max_period: 13
repetition_min_period: 1
repetition_max_count: 32
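
# Annotation (not in the original config): this filter is understood to drop
# training instances containing a token pattern with period between 1 and 13
# repeated more than 32 times, i.e. degenerate repetition loops.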
paths:
#SOURCE: s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/ (5.23BT)
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-38-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-02-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-64-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-48-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-76-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-11-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-35-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-47-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-83-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-04-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-15-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-46-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-22-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-61-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-44-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-43-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-63-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-74-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-45-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-53-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-14-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-80-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-54-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-32-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-75-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-72-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-86-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-30-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-23-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-03-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-62-00000.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/automathtext/part-26-00000.npy
#SOURCE: s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2 (6.07BT)
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-24-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-24-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-26-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-08-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-25-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-53-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-15-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-07-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-45-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-31-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-57-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-09-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0023/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-63-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy