diff --git a/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml b/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml
new file mode 100644
index 000000000..a3001de8d
--- /dev/null
+++ b/configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml
@@ -0,0 +1,323 @@
+run_name: v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml
+seed: 61394
+dry_run: false
+
+wandb:
+  name: ${run_name}
+  project: olmo-annealing
+  group: v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml
+
+model:
+  d_model: 4096
+  n_heads: 32
+  n_layers: 32
+  # mlp_ratio: 6
+  mlp_hidden_size: 22016
+  weight_tying: false
+  alibi: false
+  rope: true
+  flash_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  multi_query_attention: false
+  include_bias: false
+  block_type: sequential
+  layer_norm_type: default
+  layer_norm_with_affine: false
+  bias_for_layer_norm: false
+  attention_layer_norm_with_affine: false
+  activation_type: swiglu
+  residual_dropout: 0.0
+  embedding_dropout: 0.0
+  max_sequence_length: 2048
+  vocab_size: 50280
+  embedding_size: 50304
+  eos_token_id: 0
+  pad_token_id: 1
+  init_device: meta
+  init_fn: mitchell
+
+compile: null
+
+optimizer:
+  name: adamw
+  learning_rate: 1.5e-4  # This is half the max LR from the official run.
+  weight_decay: 0.1
+  betas:
+    - 0.9
+    - 0.95
+  metrics_log_interval: 10
+
+scheduler:
+  name: linear_with_warmup
+  t_warmup: 1000
+  alpha_f: 0.1
+
+tokenizer:
+  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
+  truncate_direction: right
+
+save_folder: /data
+save_overwrite: false
+remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name}
+# Sharded checkpoints (best for restarts)
+save_interval: 500
+save_num_checkpoints_to_keep: -1
+# Unsharded checkpoints (for final storage)
+save_interval_unsharded: null
+save_num_unsharded_checkpoints_to_keep: -1
+
+restore_dataloader: false
+
+# 2T token checkpoint for new 7B model; we call this v1.7 to match the Dolma data.
+load_path: s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step477000-unsharded
+
+no_pre_train_checkpoint: true
+reset_optimizer_state: true  # Set both of these to false when restarting this run.
+reset_trainer_state: true
+
+max_duration: 50e9T
+global_train_batch_size: 3072
+device_train_microbatch_size: 3
+time_limit: null
+
+precision: amp_bf16
+
+fsdp:
+  wrapping_strategy: by_block_and_size
+  precision: mixed
+  sharding_strategy: SHARD_GRAD_OP
+
+max_grad_norm: 1.0
+max_grad_norm_ratio: null
+
+speed_monitor:
+  window_size: 20
+
+eval_interval: ${save_interval}
+eval_subset_num_batches: -1
+device_eval_batch_size: ${device_train_microbatch_size}
+evaluators:
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      # pin_memory: true
+      # prefetch_factor: 1
+      # persistent_workers: false
+      # timeout: 0
+      datasets:
+        4chan-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy
+        c4_100_domains-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy
+        c4_en-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy
+        gab-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy
+        ice-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy
+        m2d2_s2orc-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy
+        m2d2_wiki-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy
+        manosphere-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy
+        mc4_en-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy
+        pile-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy
+        ptb-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy
+        twitterAEE-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy
+        wikitext_103-validation:
+          - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy
+
+  ##########################
+  # Downstream evaluations #
+  ##########################
+  - label: piqa
+    type: downstream
+
+  - label: hellaswag
+    type: downstream
+
+  - label: winogrande
+    type: downstream
+
+  - label: openbook_qa
+    type: downstream
+
+  - label: boolq
+    type: downstream
+
+  - label: sciq
+    type: downstream
+
+  - label: arc_easy
+    type: downstream
+
+  - label: arc_challenge
+    type: downstream
+
+  - label: mmlu_stem
+    type: downstream
+
+  - label: mmlu_humanities
+    type: downstream
+
+  - label: mmlu_social_sciences
+    type: downstream
+
+  - label: mmlu_other
+    type: downstream
+
+  - label: mmlu_stem_var
+    type: downstream
+
+  - label: mmlu_humanities_var
+    type: downstream
+
+  - label: mmlu_social_sciences_var
+    type: downstream
+
+  - label: mmlu_other_var
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
+  #- label: copa
+  #  type: downstream
+
+  #- label: rte
+  #  type: downstream
+
+  #- label: commitment_bank
+  #  type: downstream
+
+  #- label: mrpc
+  #  type: downstream
+
+  #- label: sst2
+  #  type: downstream
+
+data:
+  pad_direction: right
+  num_workers: 16
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: 1
+  persistent_workers: true
+  timeout: 0
+  paths:
+    ######### NON WEB DATA #########
+    # ~> GUTENBERG BOOKS (5.256 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy
+    # ~> PES2O STEM PAPERS (9.5 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    # ~> REDPAJAMA STACKEXCHANGE (9.4 GT)
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_stackexchange_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    # ~> REDPAJAMA ARXIV (11.3 GT)
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy
+    - s3://ai2-llm/preprocessed/redpajama_arxiv_only/v1_decontaminated/gpt-neox-olmo-dolma-v1_5/part-04-00002.npy
+    # ~> PROOFPILE2 ALGEBRAIC STACK (9.7 GT)
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    # ~> PROOFPILE2 OPENWEBMATH (12.734 GT)
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    ####################################
+    ######### CODE #########
+    # ~> STARCODER (11.5 GT)
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    ####################################
+    ######### WEB HIGH QUALITY #########
+    # ~> C4 (9.85 GT)
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    # ~> REDDIT (10.4 GT)
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    # ~> FALCON (11.9 GT)
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
diff --git a/configs/annealing/v1.7_step_2T-warmup_true-steps_50B.yaml b/configs/annealing/v1.7_step_2T-warmup_true-steps_50B.yaml
index 8705ac054..937b3356f 100644
--- a/configs/annealing/v1.7_step_2T-warmup_true-steps_50B.yaml
+++ b/configs/annealing/v1.7_step_2T-warmup_true-steps_50B.yaml
@@ -61,7 +61,7 @@ save_folder: /data
 save_overwrite: false
 remote_save_folder: s3://ai2-llm/checkpoints/davidw/annealing/${run_name}
 # Sharded checkpoints (best for restarts)
-save_interval: 200
+save_interval: 500
 save_num_checkpoints_to_keep: -1
 # Unsharded checkpoints (for final storage)
 save_interval_unsharded: null
 save_num_unsharded_checkpoints_to_keep: -1
@@ -185,6 +185,18 @@ evaluators:
   - label: mmlu_other_var
     type: downstream
 
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
   #- label: copa
   #  type: downstream
 
diff --git a/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.sh b/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.sh
new file mode 100755
index 000000000..053b6f747
--- /dev/null
+++ b/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -ex
+
+CONFIG_PATH=configs/annealing/v1.7-step_2T-warmup_true-steps_50B-flan_false.yaml
+NUM_NODES=3
+
+gantry run \
+  --allow-dirty \
+  --workspace ai2/davidw-oe-annealing \
+  --task-name v1.7-step_2T-warmup_true-steps_50B-flan_false \
+  --description v1.7-step_2T-warmup_true-steps_50B-flan_false \
+  --priority high \
+  --beaker-image petew/olmo-torch2-gantry \
+  --cluster ai2/general-cirrascale-a100-80g-ib \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --nfs \
+  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
+  --budget ai2/oe-training \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \
+  --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH}"
diff --git a/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B.sh b/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B.sh
index 414a5e0a9..5337efc94 100755
--- a/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B.sh
+++ b/scripts/beaker/annealing/v1.7-step_2T-warmup_true-steps_50B.sh
@@ -3,7 +3,7 @@
 set -ex
 
 CONFIG_PATH=configs/annealing/v1.7-step_2T-warmup_true-steps_50B.yaml
-NUM_NODES=4
+NUM_NODES=3
 
 gantry run \
   --allow-dirty \
@@ -12,7 +12,7 @@ gantry run \
   --description v1.7-step_2T-warmup_true-steps_50B \
   --priority high \
   --beaker-image petew/olmo-torch2-gantry \
-  --cluster ai2/general-cirrascale-a100-80g-ib \
+  --cluster ai2/pluto-cirrascale \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \
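
For context, a quick back-of-the-envelope check of the schedule the new config implies, using only values from the diff above (max_sequence_length 2048, global_train_batch_size 3072, max_duration 50e9T, save_interval 500): each optimizer step consumes 3072 x 2048 = 6,291,456 tokens, so the 50B-token anneal runs for roughly 7,947 steps, the first 1,000 of which are linear LR warmup. A minimal sketch of that arithmetic (not part of the patch; variable names are ad hoc, not from the repo):

#!/usr/bin/env bash
# Sanity-check the step count implied by the annealing config above.
SEQ_LEN=2048            # model.max_sequence_length
GLOBAL_BATCH=3072       # global_train_batch_size
MAX_TOKENS=50000000000  # max_duration: 50e9T
SAVE_INTERVAL=500       # sharded-checkpoint cadence

TOKENS_PER_STEP=$((SEQ_LEN * GLOBAL_BATCH))    # 6,291,456 tokens/step
TOTAL_STEPS=$((MAX_TOKENS / TOKENS_PER_STEP))  # ~7,947 steps
echo "tokens/step : ${TOKENS_PER_STEP}"
echo "total steps : ~${TOTAL_STEPS} (first 1000 are warmup)"
echo "checkpoints : ~$((TOTAL_STEPS / SAVE_INTERVAL)) sharded saves"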